from lxml import etree
import requests
# Load the local sample page; the context manager closes the handle even if
# reading fails.
with open('xpath.html', 'r', encoding='utf-8') as file:
    html = file.read()

selector = etree.HTML(html)

# 1. Direct text nodes of every <div class="works"> (the first and second div).
div = selector.xpath('//div[@class="works"]/text()')
print(div, type(div))

# 2. Direct text nodes of every <ul> element.
lc = selector.xpath('//ul/text()')
print(lc, type(lc))

# 3. Narrow the selection to the <a> elements inside the first three <li>
#    items of the "title" list under the "works" div.
infos = selector.xpath('//div[@class="works"][1]/ul[@class="title"][1]/li[position()<4]/a')
for info in infos:
    # xpath() returns a list; fall back to None instead of raising IndexError
    # when an anchor has no direct text node or no href attribute.
    a_text = next(iter(info.xpath('text()')), None)
    a_href = next(iter(info.xpath('@href')), None)
    # Debug output is intentionally disabled: print(a_text, a_href)
# Navigation links of the ygdy8.com home page.
url = 'http://www.ygdy8.com/'
# Browser-like request headers.
# NOTE(review): this dict is built but not passed to requests.get() below.
# Before wiring it in, drop 'If-Modified-Since' / 'If-None-Match' (a stale
# conditional request can come back as an empty 304 response) and 'Host'
# (requests derives it from the URL, and a fixed value would be wrong after
# a redirect to another host).
headers = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate',
    'Accept-Language':'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
    'Cache-Control':'max-age=0',
    'Connection':'keep-alive',
    'DNT':'1',
    'Host':'www.ygdy8.com',
    'If-Modified-Since':'Tue, 05 Sep 2017 14:46:00 GMT',
    'If-None-Match':'"0448db05526d31:530"',
    'Referer':'https://www.google.com/',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
}

# Without a timeout requests waits indefinitely on an unresponsive server.
req = requests.get(url, timeout=10)
# Override the detected charset so req.text is decoded as gb2312.
req.encoding = 'gb2312'
html = req.text

selector = etree.HTML(html)
# <a> elements inside the list items of the "contain" div.
info2 = selector.xpath('//div[@class="contain"][1]/ul/li/a')