通过 Python 批量自动化下载《斗破苍穹》小说
效果是这样的
我的代码
import re
import urllib.request
import time
# Index page that lists the chapter links; handed to get_list_link() when the
# file is run as a script.
url = 'http://www.liewen.cc/b/0/18/'
def get_list_link(url, save_dir='E:/novel'):
    """Download every chapter linked from a table-of-contents page.

    The page at ``url`` is fetched and decoded as GBK, and every
    ``<dd><a href="...">title</a></dd>`` entry is extracted. For each entry,
    in page order, the file ``<save_dir>/<n>.<title>.txt`` (``n`` counts from
    1) is opened in append mode, the title is written as a line, and
    ``get_detail_info`` is called to append the chapter body.

    Args:
        url: Address of the table-of-contents page.
        save_dir: Directory that receives the chapter files. It is created
            when it does not exist yet.
    """
    import os
    from urllib.parse import urljoin

    # Release the HTTP connection as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        page_html = response.read().decode('gbk')

    os.makedirs(save_dir, exist_ok=True)

    entry_pattern = re.compile(r'<dd><a href="(.*?)">(.*?)</a></dd>')
    entries = entry_pattern.findall(page_html)
    for num, (href, title) in enumerate(entries, start=1):
        # Resolve the href against the page URL so root-relative, relative
        # and absolute links all produce a valid address.
        full_link = urljoin(url, href)
        # Characters such as ? : * / are not allowed in file names on common
        # file systems; replace them so open() does not fail on such titles.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        filename = '{}/{}.{}.txt'.format(save_dir, num, safe_title)
        print('正在下载:%s,访问链接是:%s' % (title, full_link))
        with open(filename, 'a') as chapter_file:
            chapter_file.write(title + '\n')
        # Called only after the handle above is closed, so the title is
        # already on disk before the chapter body is appended.
        get_detail_info(full_link, filename)
def get_detail_info(url, filename):
    """Append the body text of one chapter page to ``filename``.

    The page at ``url`` is fetched and decoded as GBK. The text captured by
    the first ``<div id="content"> ...</div>`` match is taken, every
    ``<br /><br /> `` separator in it is replaced by a newline, and the
    result is appended to ``filename``. When the page contains no such
    element, nothing is written and the file is not opened.

    Args:
        url: Address of the chapter page.
        filename: Path of the text file to append to.
    """
    # Use the response as a context manager so the connection is released
    # even when reading or decoding raises.
    with urllib.request.urlopen(url) as response:
        page_html = response.read().decode('gbk')

    # No DOTALL flag: '.' does not cross newlines, so only content that sits
    # on a single line of the page source is matched.
    match = re.search(r'<div id="content"> (.*?)</div>', page_html)
    if match is None:
        return

    chapter_text = match.group(1).replace('<br /><br /> ', '\n')
    with open(filename, 'a') as out_file:
        out_file.write(chapter_text)
# Start the download only when the file is executed directly, not on import.
if __name__ == '__main__':
    get_list_link(url)
总结
- 爬虫思路:访问目录页获取各章节链接 -> 逐章访问章节页面 -> 提取正文并保存到文件;
- 正则表达式的使用;
- format与with open as语法的使用;
- 编码方式调整;
- urllib.request与re模块的使用。