"""
__coding__ = 'UTF-8'
__author__ = 'bingo'
__date__ = '2020/8/21'
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
┏┓ ┏┓
┏┛┻━━━┛┻━━┓
┃ ☃ ┃
┃ ┳┛ ┗┳ ┃
┃ ┻ ┃
┗━┓ ┏━┛
┃ ┗━━━━━┓
┃ 神兽保佑 ┣┓
┃ 永无BUG! ┏┛
┗━━━┓┓┏━━┳┓┏┛
┃┫┫ ┃┫┫
┗┻┛ ┗┻┛
"""
from lxml import etree
# Sample markup shared by every XPath example below: a nav <div> that holds a
# <ul> of eight <li><a> entries, followed by one extra <a> that is a direct
# child of the <div> (it sits after the list, outside the <ul>).
html_content = """
<div class="nav-items">
<ul>
<li class="li li1" name="购票"><a href="https://movie.douban.com/cinema/nowplaying/">影讯&购票</a></li>
<li class="li li2"><a href="https://movie.douban.com/explore">选电影</a></li>
<li class="li li3"><a href="https://movie.douban.com/tv/">电视剧</a></li>
<li class="li li4"><a href="https://movie.douban.com/chart">排行榜</a></li>
<li class="li li5"><a href="https://movie.douban.com/tag/">分类</a></li>
<li class="li li6"><a href="https://movie.douban.com/review/best/">影评</a></li>
<li class="li li7"><a href="https://movie.douban.com/annual/2019?source=navigation">2019年度榜单</a></li>
<li class="li8"><a href="https://m.douban.com/standbyme/annual2019?source=navigation" target="_blank">2019书影音报告</a></li>
</ul>
<a href="www.baidu.com">百度一下,你就知道</a>
</div>
"""
# Parse the markup with lxml; `html` is the object every xpath() call below
# is evaluated against.
html = etree.HTML(html_content)
# 1. Select every element node in the document.
a = html.xpath("//*")
print(a)
# Expected: a list of 21 <Element ...> objects in document order (the memory
# address inside each repr differs from run to run):
#   html, body, div, ul, then an (li, a) pair for each of the 8 list items,
#   and finally the <a> that is a direct child of the div.
# NOTE(review): the output previously recorded here listed only 20 elements
# and left out that last <a>; the count above is derived from html_content,
# so re-run the script to confirm.
# 2. Select every <a> element, wherever it appears.
b = html.xpath("//a")
print(b)
# Expected: a list of 9 <Element a ...> objects: the 8 links nested inside
# the <li> items plus the <a> placed directly under the div.
# NOTE(review): the output previously recorded here showed only 8 elements;
# html_content contains 9 <a> tags, so re-run the script to confirm.
# 3. Select direct children only ("/" steps one level down).
c = html.xpath("//div/*")
print(c)
# Expected: [<Element ul at 0x...>, <Element a at 0x...>]
# NOTE(review): the output previously recorded here showed only the <ul>,
# but html_content also has an <a> directly under the div; re-run to confirm.
# 4. Select all descendants ("//" steps down any number of levels).
d = html.xpath("//div//*")
print(d)
# Expected: a list of 18 <Element ...> objects in document order:
#   ul, then an (li, a) pair for each of the 8 list items, and finally the
#   <a> that is a direct child of the div.
# NOTE(review): the output previously recorded here listed only 17 elements
# and left out that last <a>; re-run the script to confirm.
# 5. Select the parent node. ".." and the explicit "parent::*" axis are two
# spellings of the same step, so both queries return the same element.
e1 = html.xpath("//ul/..")
e2 = html.xpath("//ul/parent::*")
print(e1, e2, sep="\n")
# Recorded output (addresses vary per run):
# >>> [<Element div at 0x1e404d17d48>]
# >>> [<Element div at 0x1e404d17d48>]
# 6. Matching on attributes
# Exact match: the whole class attribute must equal 'li8'.
f1 = html.xpath("//li[@class='li8']")
# contains() for an attribute that carries several values. It is a plain
# substring test, so 'li1' would also match a class such as 'li10'.
f2 = html.xpath("//li[contains(@class, 'li1')]")
# Several attribute conditions combined with `and`.
f3 = html.xpath("//li[contains(@class, 'li') and @name='购票']/a/text()")
print(f1, f2, f3, sep="\n")
# Recorded output (addresses vary per run):
# >>> [<Element li at 0x22ea0bc10c8>]
# >>> [<Element li at 0x22ea0bb8d88>]
# >>> ['影讯&购票']
# 7. Reading attribute values: a trailing "/@name" step yields the attribute's
# string value instead of an element.
g1 = html.xpath("//li[@name='购票']/a/@href")
g2 = html.xpath("//li[@name='购票']/@class")
print(g1, g2, sep="\n")
# Recorded output:
# >>> ['https://movie.douban.com/cinema/nowplaying/']
# >>> ['li li1']
# 8. Reading text: "//text()" collects every text node below each <li>.
h = html.xpath("//li//text()")
print(h)
# >>> ['影讯&购票', '选电影', '电视剧', '排行榜', '分类', '影评', '2019年度榜单', '2019书影音报告']
# 9. Selecting by position. Positional predicates count among the siblings
# under the same parent and start at 1.
i1 = html.xpath("//li[1]/a/text()")  # the first <li>
i2 = html.xpath("//li[last()]/a/text()")  # the last <li>
i3 = html.xpath("//li[position()<4]/a/text()")  # the first three <li>
i4 = html.xpath("//li[last()-1]/a/text()")  # the second-to-last <li>
print(i1, i2, i3, i4, sep="\n")
# Recorded output:
# >>> ['影讯&购票']
# >>> ['2019书影音报告']
# >>> ['影讯&购票', '选电影', '电视剧']
# >>> ['2019年度榜单']
# 10. Axis selection
"""
ancestor 选取当前节点的所有先辈(父、祖父等)。
ancestor-or-self 选取当前节点的所有先辈(父、祖父等)以及当前节点本身。
attribute 选取当前节点的所有属性。
child 选取当前节点的所有子元素。
descendant 选取当前节点的所有后代元素(子、孙等)。
descendant-or-self 选取当前节点的所有后代元素(子、孙等)以及当前节点本身。
following 选取文档中当前节点的结束标签之后的所有节点。
following-sibling 选取文档中当前节点的结束标签之后的所有同级节点。
parent 选取当前节点的父节点。
preceding 选取文档中当前节点的开始标签之前的所有节点。
preceding-sibling 选取当前节点之前的所有同级节点。
self 选取当前节点。
"""
# English gloss of the axis table in the string above:
#   ancestor            every ancestor (parent, grandparent, ...) of the node
#   ancestor-or-self    every ancestor plus the node itself
#   attribute           every attribute of the node
#   child               every child element of the node
#   descendant          every descendant element (children, grandchildren, ...)
#   descendant-or-self  every descendant element plus the node itself
#   following           everything after the node's closing tag
#   following-sibling   every sibling after the node's closing tag
#   parent              the node's parent
#   preceding           everything before the node's opening tag
#   preceding-sibling   every sibling before the node
#   self                the node itself

# The two context nodes the examples start from; each query appends one axis step.
first_li = "//li[1][contains(@class, 'li')]"
third_li = "//li[3][contains(@class, 'li')]"

j1 = html.xpath(first_li + "/ancestor::*")  # all ancestor elements
j2 = html.xpath(first_li + "/ancestor-or-self::*")  # ancestors plus the node itself
j3 = html.xpath(first_li + "/attribute::*")  # values of all attributes on the node
j4 = html.xpath(first_li + "/child::a/text()")  # text of the node's <a> children
j5 = html.xpath(first_li + "/descendant::*")  # all descendant elements
j6 = html.xpath(first_li + "/descendant-or-self::*")  # the node itself plus its descendants
j7 = html.xpath(first_li + "/following::a/text()")  # text of every <a> after the node's closing tag
j8 = html.xpath(first_li + "/following-sibling::*/a/text()")  # <a> text under each later sibling
j9 = html.xpath(third_li + "/preceding::*")  # every element before the node's opening tag
j10 = html.xpath(third_li + "/preceding-sibling::*")  # every sibling before the node
j11 = html.xpath(first_li + "/parent::*")  # the node's parent

# Print each result with its variable name as the label, in j1..j11 order.
for label, found in (
    ("j1", j1),
    ("j2", j2),
    ("j3", j3),
    ("j4", j4),
    ("j5", j5),
    ("j6", j6),
    ("j7", j7),
    ("j8", j8),
    ("j9", j9),
    ("j10", j10),
    ("j11", j11),
):
    print("%s: %s" % (label, found))
# Recorded output (addresses vary per run):
# >>> j1: [<Element html at 0x1b91f34bf08>, <Element body at 0x1b91f3b7c08>, <Element div at 0x1b91f3b7c88>, <Element ul at 0x1b91f3b7a88>]
# >>> j2: [<Element html at 0x1b91f34bf08>, <Element body at 0x1b91f3b7c08>, <Element div at 0x1b91f3b7c88>, <Element ul at 0x1b91f3b7a88>, <Element li at 0x1b91f3b7b88>]
# >>> j3: ['li li1', '购票']
# >>> j4: ['影讯&购票']
# >>> j5: [<Element a at 0x1b91f3ba508>]
# >>> j6: [<Element li at 0x1b91f3b7b88>, <Element a at 0x1b91f3ba508>]
# >>> j7: ['选电影', '电视剧', '排行榜', '分类', '影评', '2019年度榜单', '2019书影音报告', '百度一下,你就知道']
# >>> j8: ['选电影', '电视剧', '排行榜', '分类', '影评', '2019年度榜单', '2019书影音报告']
# >>> j9: [<Element li at 0x1b91f3b7b88>, <Element a at 0x1b91f3ba508>, <Element li at 0x1b91f3ba548>, <Element a at 0x1b91f3ba588>]
# >>> j10: [<Element li at 0x1b91f3b7b88>, <Element li at 0x1b91f3ba548>]
# >>> j11: [<Element ul at 0x1b91f3b7a88>]
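# NOTE: the lines below are not part of the script. They are leftover text from
# the web page this example was copied from (article title, copyright notice,
# and truncated excerpts of unrelated articles). They are kept as comments so
# the file remains valid Python.
# 爬虫—xpath用法
# 最后编辑于 :
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...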