I copied someone else's baike_spider crawler code from the internet, but every run gave the same result: it crawled a single page and then failed:
craw 1 : http://baike.baidu.com/view/21087.htm
craw failed
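The likely root cause (my reading, assuming the copied code was written for Python 2) is the old-style urllib call: under Python 3 there is no urllib.urlopen, so the very first download raises an exception, which the spider's try/except reports as "craw failed". A two-line check reproduces it:

# Under Python 3, the Python 2 style call fails immediately
# (assumption: the copied code was targeting Python 2):
import urllib
urllib.urlopen("http://baike.baidu.com/view/21087.htm")
# AttributeError: module 'urllib' has no attribute 'urlopen'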
After searching Baidu, comparing my code with other people's, and pulling together suggestions from various experts, I found the following two problems:
1. In the html_downloader.py file
import urllib.request   # must be import urllib.request, not import urllib

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
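With that change the downloader can be smoke-tested on its own before wiring it back into the spider. This is just my own check, not part of the original project; it assumes html_downloader.py is in the current directory, and the URL is only an example entry:

# Standalone check of HtmlDownloader: fetch one page and report its size.
from html_downloader import HtmlDownloader

downloader = HtmlDownloader()
html = downloader.download("http://baike.baidu.com/item/Python")
if html is None:
    print("download failed")                    # non-200 response
else:
    print("downloaded %d bytes" % len(html))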
2. In the html_parser.py file, the regular expression for entry links had to be changed
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import re
import urllib.parse   # must be import urllib.parse, not import urllib (same reason as in 1)

class HtmlParser(object):
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # entry links used to look like /view/123.htm
        # links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))   # regex used in the code found online
        links = soup.find_all('a', href=re.compile(r"/item/"))             # regex after the URL scheme changed
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        # url
        res_data['url'] = page_url
        # title: <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        # summary: <div class="lemma-summary">...</div>
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
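For context, here is roughly how the two fixed modules are driven. This is only a sketch of the usual layout for this kind of spider; the SpiderMain class, the craw() loop, and the max_count limit are my assumptions, not the poster's actual spider_main.py:

# spider_main.py (sketch): ties HtmlDownloader and HtmlParser together.
from html_downloader import HtmlDownloader
from html_parser import HtmlParser

class SpiderMain(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()

    def craw(self, root_url, max_count=10):
        urls = {root_url}        # URLs still to visit
        visited = set()          # URLs already crawled
        count = 1
        while urls and count <= max_count:
            new_url = urls.pop()
            try:
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                visited.add(new_url)
                urls |= (new_urls - visited)
                print(new_data['title'])
                count += 1
            except Exception:
                print('craw failed')

if __name__ == '__main__':
    SpiderMain().craw('http://baike.baidu.com/item/Python')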
After these changes, the spider runs without problems.