第一部分修改后已经能够正常显示。主要问题是不同的帖子有些字段缺省,需要补齐默认值,不然后续不好处理。处理文本的能力还是要加强啊。
csv文件以wb方式打开就不会每一行之后都有一个空行(此写法仅适用于 Python 2;Python 3 下应改用 open(path, 'w', newline='') 达到同样效果)。
def parse_title():
    """Scrape the post listing of the Eastmoney 'meigu' guba board, pages 1-22.

    For every post row found on each listing page, collect its title, author,
    read count, comment count, post time, last-update time and link.

    Returns:
        list of dict: one dict per post with keys 'title', 'author', 'read',
        'comment_num', 'post_time', 'last_update', 'link' (the raw href) and
        'complete_link' (absolute URL).
    """
    def _text(value):
        # lxml xpath results may be bytes or text depending on version and
        # source encoding; decode only when actually bytes. Calling
        # .decode('utf-8') unconditionally breaks on non-ASCII text under
        # Python 2 (implicit ascii encode) and fails under Python 3.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    rows = []
    for num in range(1, 23):  # pages 1..22 of the board listing
        url = "http://guba.eastmoney.com/list,meigu_" + str(num) + ".html"
        html = requests.get(url=url, headers=headers).content
        selector = etree.HTML(html)
        # Skip the first child (table header) and the last (pager row).
        items = selector.xpath(
            "//div[@id='articlelistnew']/div[position()>1 and position()<last()]")
        for item in items:
            title = _text(item.xpath("span[@class='l3']/a/text()")[0])
            # Some posts have no author link; fall back to "anonymous".
            author_nodes = item.xpath("span[@class='l4']/a/text()")
            author = _text(author_nodes[0]) if author_nodes else u'匿名网友'
            read = item.xpath("span[@class='l1']/text()")[0]
            comment_num = item.xpath("span[@class='l2']/text()")[0]
            post_time = item.xpath("span[@class='l6']/text()")[0]
            last_update = item.xpath("span[@class='l5']/text()")[0]
            # Store the scalar href (the original kept the whole xpath result
            # list here, inconsistent with every other field).
            link = item.xpath("span[@class='l3']/a/@href")[0]
            # Hrefs are sometimes site-relative ('/news...'), sometimes bare.
            if str(link).startswith('/'):
                complete_link = 'http://guba.eastmoney.com' + link
            else:
                complete_link = 'http://guba.eastmoney.com/' + link
            rows.append({
                'title': title,
                'author': author,
                'read': read,
                'comment_num': comment_num,
                'post_time': post_time,
                'last_update': last_update,
                'link': link,
                'complete_link': complete_link,
            })
    return rows