睡前故事python-油猴中文网 - Powered by Discuz! Archiver

陈公子的话 发表于 2020-12-3 15:10:58

睡前故事python

就很糟心，写完了才看到有一个专门的库，用它去处理这种问题会简单很多。。。

过几天重写一下⑧

先记录一下这个代码
import requests
import re
# Index page that lists every story of the collection.
_INDEX_URL = 'http://book.sbkk8.com/gushihui/taijiaogushi/'

# Browser-like User-Agent sent with every request.
_REQUEST_HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'}

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
_REQUEST_TIMEOUT = 10

# Matches '</span><p>' followed by a fixed run of lines ('.' does not match a
# newline, '\s' consumes each line break). Compiled once instead of once per page.
_STORY_PATTERN = re.compile(
    r'</span><p>.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*\s.*')

# (old, new) pairs applied strictly in this order. The order matters: several
# later entries (for example the ones containing '<span "toup1">') only match
# after 'class=' has already been removed by an earlier entry.
_STORY_REPLACEMENTS = (
    ('<br style="color: rgb(34, 34, 34); font-family: sans-serif, 宋体, 微软雅黑, arial, ', ''),
    ('<u>一</u>', ''),
    ('”', '"'),
    ('</p></div>', ''),
    (', Helvetica; line-height: 24px;" />', ''),
    ("'Helvetica Neue'", ''),
    ('“', '"'),
    ('</span><p>', ''),
    ('</html>', ''),
    ('</body>', ''),
    ('<script src="http://book.sbkk8.com/js/foot.js" type="text/javascript"></script>', ''),
    ('</div>', ''),
    ('<p>声明 :本网站尊重并保护知识产权，根据《信息网络传播权保护条例》，如果我们转载的作品侵犯了您的权利,请通知我们，我们会及时删除。联系Q-Q：2-8-1-6-4-1-1-4-9-5</p>', ''),
    ('<p>2016 <b><a href="http://book.sbkk8.com">book.sbkk8.com</a></b> - 版权所有</p>', ''),
    ('<div id="footer"> ', ''),
    (' <script src="http://book.sbkk8.com/js/pinglun.js" type="text/javascript"></script>', ''),
    ('<script src="http://book.sbkk8.com/js/adBottom.js" type="text/javascript"></script>', ''),
    ('<div class="sohuCmt1">', ''),
    ('<div class="adBottom">', ''),
    ('>【下一篇】<i>：</i></a></span>', ''),
    ("'pagedaohang'", ''),
    ('class=', ''),
    ("'/gushihui/taijiaogushi/92709.html'", ''),
    ('</span><span "toup1"><a href=', ''),
    ('<script src="http://book.sbkk8.com/js/adBottom_sougou.js" type="text/javascript"></script>', ''),
    ('>【第一篇】</a></span><span "tomenu"><a href="/gushihui/taijiaogushi/" title="胎教故事" "returnIndex">【回目录】</a>', ''),
    (" 'pagedaohang pagedaohang1'", ''),
    ('<div "prenext prenext1"><span "todown1"><a', ''),
    ('<p>', ''),
    ('</p>', ''),
    # NOTE(review): the pasted code replaced '"' with '"' (a no-op); presumably
    # the search string was the '&quot;' entity before the forum decoded it,
    # which would match the quote normalisation above - confirm.
    ('&quot;', '"'),
    ('……', ''),
)


def _clean_story(raw: str) -> str:
    """Apply every entry of _STORY_REPLACEMENTS to *raw*, in order."""
    for old, new in _STORY_REPLACEMENTS:
        raw = raw.replace(old, new)
    return raw


def _fetch_story(url: str) -> str:
    """Download one story page and return its cleaned text ('' if nothing matched)."""
    page = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    page.encoding = 'gbk'
    matches = _STORY_PATTERN.findall(page.text)
    # findall() returns a list, so the text has to be picked out of it before
    # any str.replace() can run. NOTE(review): using the first match is an
    # assumption - the pasted code lost its subscript here.
    if not matches:
        return ''
    return _clean_story(matches[0])


def get_stroy() -> None:
    """Scrape every story linked from the index page and save each one to a file.

    Fetches the index page, extracts the story titles and relative links with
    regular expressions, downloads every story page, strips the surrounding
    markup and writes each story to '<title>.txt' in the current working
    directory. Progress is reported with print().

    The pasted version of this function had lost its body indentation and
    every ``[i]`` subscript, so it could not run; both are restored here by
    iterating over the list elements directly.

    Raises:
        requests.RequestException: if a page cannot be fetched in time.
        OSError: if an output file cannot be written.
    """
    res = requests.get(url=_INDEX_URL, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    res.encoding = 'gbk'

    # Some scraped titles are wrapped in <b>...</b>; strip those tags.
    raw_names = re.compile(r'.html" target="_blank">(.*?)</a>').findall(res.text)
    name_list = [name.replace('<b>', '').replace('</b>', '') for name in raw_names]
    print(name_list)  # cleaned story titles

    # Collect the relative story links and turn them into absolute URLs.
    url_list1 = re.compile(r'mululist"> <a href="(.*?)" target="_blank">').findall(res.text)
    print(url_list1)
    url_list = [
        link.replace('/gushihui/', 'http://book.sbkk8.com/gushihui/')
        for link in url_list1
    ]
    print(url_list)

    # Visit every story URL and extract its text.
    story = [_fetch_story(link) for link in url_list]
    print(len(story))

    # zip() stops at the shorter list, so a mismatch between the number of
    # titles and the number of stories cannot raise IndexError.
    for name, text in zip(name_list, story):
        # Explicit UTF-8 keeps the output independent of the platform's
        # default locale encoding.
        with open(name + '.txt', 'w', encoding='utf-8') as f:
            f.write(text)
        print(name + '       已爬取！')

# Run the scraper only when this file is executed as a script, not on import.
# The call must be indented under the guard; at column 0 it is a syntax error.
if __name__ == "__main__":
    get_stroy()

页: [1]

油猴中文网's Archiver

睡前故事python