Parsing a more complex Tieba page with XPath
Code first:
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from lxml import etree
import requests
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

url = 'https://tieba.baidu.com/p/5098845608?pn=1'
html = requests.get(url)
selector = etree.HTML(html.text)
img_all_list = []  # collect every image link
content_field = selector.xpath('//div[@class="l_post l_post_bright j_l_post clearfix "]')  # select each post block
for each in content_field[1:]:
    author = each.xpath('div[1]/ul/li[@class="d_name"]/a/text()')[0]  # poster's name
    content = each.xpath('div[2]/div[@class="p_content "]/cc/div/text()')  # post text
    time = each.xpath('div[2]/div[@class="core_reply j_lzl_wrapper"]/div[1]/div[@class="post-tail-wrap"]/span[last()]/text()')[0]  # post time
    img_list = each.xpath('div[2]/div[@class="p_content "]/cc/div//@src')  # image links in the post
    img_all_list.append(img_list)
    print author
    print "\n".join(content).strip()
    print time
    print '\n'

i = 0
for img_list in img_all_list:  # download the images
    for img_url in img_list:
        pic = requests.get(img_url)
        string = str(i + 1) + img_url[-4:]  # e.g. '1.jpg' (assumes a 4-character extension)
        fp = open(string, 'wb')
        fp.write(pic.content)
        fp.close()
        i += 1
The output looks like this (along with the date-seekers' photos, of course 0-0):
If you are familiar with XPath syntax, scraping this page is straightforward. The previous post already walked through extracting page content with XPath, so I won't repeat the details here; the principle is exactly the same. This one is purely for fun~
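For anyone who skipped that post, the whole pattern boils down to two calls: parse the HTML into a tree, then query it. A minimal refresher (the fragment and expressions here are made up for illustration, not taken from the Tieba page):

from lxml import etree

# Parse an HTML string into a tree, then query it with XPath.
doc = etree.HTML('<div class="d_name"><a>Alice</a></div>')
print doc.xpath('//li[@class="d_name"]/a/text()')   # [] -- no match
print doc.xpath('//div[@class="d_name"]/a/text()')  # ['Alice']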
Two things worth noting
1. Using last()
time = each.xpath('div[2]/div[@class="core_reply j_lzl_wrapper"]/div[1]/div[@class="post-tail-wrap"]/span[last()]/text()')[0]
last() selects the last element in the matched set. Looking at this page's source, for the same path 'div[@class="post-tail-wrap"]/span', the reply time sometimes sits in the third span and sometimes in the fourth, but it is always the last one, so last() picks it up reliably.
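A quick way to convince yourself of this behavior; a toy sketch with a made-up fragment where the time is the third span in one post and the fourth in another:

from lxml import etree

# Two fake posts: the time span is 3rd in one, 4th in the other.
doc = etree.HTML('''
<div class="post-tail-wrap"><span>1F</span><span>IP</span><span>2017-05-14 11:06</span></div>
<div class="post-tail-wrap"><span>2F</span><span>IP</span><span>reply</span><span>2017-05-14 11:09</span></div>
''')
for wrap in doc.xpath('//div[@class="post-tail-wrap"]'):
    print wrap.xpath('span[last()]/text()')[0]  # always the time, whatever its position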
2. Fetching images with XPath
img_list = each.xpath('div[2]/div[@class="p_content "]/cc/div//@src')
'//@src' matches the src attribute of every descendant node, so this expression collects all the image links inside the post body. Note that it grabs every src indiscriminately, and the img_url[-4:] naming trick above assumes a four-character extension such as .jpg.
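As a quick sanity check, here is what '//@src' returns on a made-up fragment (the URLs are placeholders):

from lxml import etree

# //@src returns the src attribute of every descendant element.
doc = etree.HTML('''
<div><img src="http://example.com/a.jpg"/>
<p>text</p><img src="http://example.com/b.png"/></div>
''')
print doc.xpath('//div//@src')  # ['http://example.com/a.jpg', 'http://example.com/b.png']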
Scraping the first 10 pages
#-*- coding:utf-8 -*-
import requests
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def get_Allurl():  # build the URLs of the thread's first 10 pages
    page = []
    url = 'https://tieba.baidu.com/p/5098845608?pn='
    for i in range(1, 11):
        a = url + str(i)
        page.append(a)
    return page

def spider(url):
    info_list = []
    html = requests.get(url, timeout=5)  # raise Timeout if no response within 5 seconds
    selector = etree.HTML(html.text)
    reply = selector.xpath('//div[@class="l_post l_post_bright j_l_post clearfix "]')
    for each in reply:
        author = each.xpath('div[1]/ul/li[@class="d_name"]/a/text()')
        if len(author) == 0:  # skip blocks where no author was matched
            continue
        author = author[0]
        content = each.xpath('div[2]/div[@class="p_content "]/cc/div/text()')
        time = each.xpath('div[2]/div[@class="core_reply j_lzl_wrapper"]/div[1]/div[@class="post-tail-wrap"]/span[last()]/text()')[0]
        info = {}
        info['author'] = author
        info['reply'] = "\n\t".join(content).strip()
        info['time'] = time
        info_list.append(info)
    return info_list

def saveinfo(classinfo):  # write everything to a text file
    f = open('tiebainfo.txt', 'w')
    for info_all in classinfo:
        for each in info_all:
            f.writelines('Author: ' + each['author'] + '\n')
            f.writelines('Content:\n\t' + each['reply'] + '\n')
            f.writelines('Time: ' + each['time'] + '\n\n')
    f.close()  # note the parentheses: bare f.close does nothing

if __name__ == '__main__':
    classinfo = []
    all_url = get_Allurl()
    for url in all_url:
        print u'Processing: ' + url
        info_list = spider(url)
        classinfo.append(info_list)
    saveinfo(classinfo)
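One detail about timeout=5: requests does not quietly abandon the request; it raises requests.exceptions.Timeout, which as written will crash the main loop. A minimal sketch of catching it so one slow page doesn't kill the whole run (the skip-and-continue policy here is my own choice, not part of the original code):

for url in all_url:
    print u'Processing: ' + url
    try:
        info_list = spider(url)
    except requests.exceptions.Timeout:
        print u'Timed out, skipping: ' + url
        continue
    classinfo.append(info_list)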
That's all for now.