# 爬虫—xpath用法 (Web scraping: XPath usage with lxml)

"""
__coding__ = 'UTF-8'
__author__ = 'bingo'
__date__ = '2020/8/21'
# code is far away from bugs with the god animal protecting
    I love animals. They taste delicious.
             ┏┓   ┏┓
            ┏┛┻━━━┛┻━━┓
            ┃    ☃    ┃
            ┃  ┳┛  ┗┳ ┃
            ┃     ┻   ┃
            ┗━┓     ┏━┛
              ┃     ┗━━━━━┓
              ┃  神兽保佑  ┣┓
              ┃　永无BUG！ ┏┛
              ┗━━━┓┓┏━━┳┓┏┛
                  ┃┫┫  ┃┫┫
                  ┗┻┛  ┗┻┛
"""
from lxml import etree
# Sample markup queried by every example below: a div that holds a ul with
# eight li/a pairs, plus one extra <a> that is a direct child of the div.
# NOTE(review): the bare "&" in the first link text and the scheme-less
# "www.baidu.com" href are not strictly valid; the outputs recorded further
# down show the "&" coming through parsing unchanged.
html_content = """
<div class="nav-items">
  <ul>
    <li  class="li li1" name="购票"><a href="https://movie.douban.com/cinema/nowplaying/">影讯&购票</a></li>
    <li  class="li li2"><a href="https://movie.douban.com/explore">选电影</a></li>
    <li  class="li li3"><a href="https://movie.douban.com/tv/">电视剧</a></li>
    <li  class="li li4"><a href="https://movie.douban.com/chart">排行榜</a></li>
    <li  class="li li5"><a href="https://movie.douban.com/tag/">分类</a></li>
    <li  class="li li6"><a href="https://movie.douban.com/review/best/">影评</a></li>
    <li  class="li li7"><a href="https://movie.douban.com/annual/2019?source=navigation">2019年度榜单</a></li>
    <li  class="li8"><a href="https://m.douban.com/standbyme/annual2019?source=navigation" target="_blank">2019书影音报告</a></li>
  </ul>
  <a href="www.baidu.com">百度一下，你就知道</a>
</div>
"""

# Parse the markup with lxml's HTML parser. Every query below calls .xpath()
# on the returned object; the recorded output of "//*" shows the fragment
# wrapped in html and body elements.
html = etree.HTML(html_content)

# 1. Select every element node in the document.
a = html.xpath("//*")
print(a)
# Prints a list of 21 Element objects in document order (the memory addresses
# shown in each repr change from run to run):
#   html, body, div, ul, then eight (li, a) pairs, and finally the <a> that
#   sits directly inside the div after the ul.

# 2. Select every <a> element, at any depth.
b = html.xpath("//a")
print(b)
# Prints a list of 9 <Element a at 0x...> objects: the eight links nested in
# the li items plus the one <a> that is a direct child of the div.

# 3. Select direct children only: "/" steps exactly one level down.
c = html.xpath("//div/*")
print(c)
# >>> [<Element ul at 0x...>, <Element a at 0x...>]
# (the div has two element children: the ul and the trailing <a>)

# 4. Select all descendants: "//" after div matches at any depth below it.
d = html.xpath("//div//*")
print(d)
# Prints a list of 18 Element objects in document order:
#   ul, then eight (li, a) pairs, and finally the <a> that is a direct child
#   of the div. Compared with "//*" only html, body and the div itself are
#   missing.

# 5. Select the parent node.
# ".." is the abbreviated form of a step along the parent axis.
e1 = html.xpath("//ul/..")
print(e1)
# >>> [<Element div at 0x...>]

# The same step written with the explicit parent:: axis.
e2 = html.xpath("//ul/parent::*")
print(e2)
# >>> [<Element div at 0x...>]   (the same div as e1)

# 6. Filtering by attribute.
# Exact match: the predicate compares against the whole attribute value, so
# this only works when class holds exactly one token.
f1 = html.xpath("//li[@class='li8']")
print(f1)
# >>> [<Element li at 0x...>]

# contains() for attributes that carry several space-separated values.
# It is a plain substring test, so 'li1' would also match e.g. 'li10'.
f2 = html.xpath("//li[contains(@class, 'li1')]")
print(f2)
# >>> [<Element li at 0x...>]

# Several attribute conditions joined with "and".
f3 = html.xpath("//li[contains(@class, 'li') and @name='购票']/a/text()")
print(f3)
# >>> ['影讯&购票']

# 7. Reading attribute values: a trailing "/@name" step returns the value(s)
#    as strings instead of elements.
# href of the link inside the li whose name is '购票'.
g1 = html.xpath("//li[@name='购票']/a/@href")
print(g1)
# >>> ['https://movie.douban.com/cinema/nowplaying/']

# class of that same li; the full attribute value comes back as one string.
g2 = html.xpath("//li[@name='购票']/@class")
print(g2)
# >>> ['li li1']

# 8. Text extraction: "//text()" collects every text node beneath each li,
#    which here means the captions of the nested <a> elements.
h = html.xpath("//li//text()")
print(h)
# >>> ['影讯&购票', '选电影', '电视剧', '排行榜', '分类', '影评', '2019年度榜单', '2019书影音报告']

# 9. Selecting by position. Positions are 1-based and are counted among the
#    li children of each parent; this document has a single ul, so every
#    query below refers to one list.
# First li.
i1 = html.xpath("//li[1]/a/text()")
print(i1)
# >>> ['影讯&购票']

# Last li.
i2 = html.xpath("//li[last()]/a/text()")
print(i2)
# >>> ['2019书影音报告']

# First three li items (position 1, 2 and 3).
i3 = html.xpath("//li[position()<4]/a/text()")
print(i3)
# >>> ['影讯&购票', '选电影', '电视剧']

# Second-to-last li.
i4 = html.xpath("//li[last()-1]/a/text()")
print(i4)
# >>> ['2019年度榜单']

# 10. Selecting along node axes.
# The bare string below is a reference table of the XPath axes (axis name
# followed by a description in Chinese). It is an expression statement with no
# effect at runtime and is kept verbatim.
"""
ancestor                    选取当前节点的所有先辈（父、祖父等）。
ancestor-or-self            选取当前节点的所有先辈（父、祖父等）以及当前节点本身。
attribute                   选取当前节点的所有属性。
child                       选取当前节点的所有子元素。
descendant                  选取当前节点的所有后代元素（子、孙等）。
descendant-or-self          选取当前节点的所有后代元素（子、孙等）以及当前节点本身。
following                   选取文档中当前节点的结束标签之后的所有节点。
following-sibling           选取文档中当前节点的结束标签之后的所有同级节点。
parent                      选取当前节点的父节点。
preceding                   选取文档中当前节点的开始标签之前的所有节点。
preceding-sibling           选取当前节点之前的所有同级节点。
self                        选取当前节点。
"""
# Each query starts from an li picked by position (the first or the third)
# whose class contains 'li', then walks one axis from that context node.

# ancestor: parent, grandparent, ... up to the root.
j1 = html.xpath("//li[1][contains(@class, 'li')]/ancestor::*")
print("j1: %s" % j1)
# >>> j1: [<Element html at 0x...>, <Element body at 0x...>, <Element div at 0x...>, <Element ul at 0x...>]

# ancestor-or-self: the ancestors plus the context node itself.
j2 = html.xpath("//li[1][contains(@class, 'li')]/ancestor-or-self::*")
print("j2: %s" % j2)
# >>> j2: [<Element html at 0x...>, <Element body at 0x...>, <Element div at 0x...>, <Element ul at 0x...>, <Element li at 0x...>]

# attribute: the values of all attributes on the context node.
j3 = html.xpath("//li[1][contains(@class, 'li')]/attribute::*")
print("j3: %s" % j3)
# >>> j3: ['li li1', '购票']

# child: direct children, here narrowed to <a> and reduced to its text.
j4 = html.xpath("//li[1][contains(@class, 'li')]/child::a/text()")
print("j4: %s" % j4)
# >>> j4: ['影讯&购票']

# descendant: children, grandchildren, ... of the context node.
j5 = html.xpath("//li[1][contains(@class, 'li')]/descendant::*")
print("j5: %s" % j5)
# >>> j5: [<Element a at 0x...>]

# descendant-or-self: the descendants plus the context node itself.
j6 = html.xpath("//li[1][contains(@class, 'li')]/descendant-or-self::*")
print("j6: %s" % j6)
# >>> j6: [<Element li at 0x...>, <Element a at 0x...>]

# following: everything after the context node's closing tag, at any level;
# this reaches the <a> outside the ul as well.
j7 = html.xpath("//li[1][contains(@class, 'li')]/following::a/text()")
print("j7: %s" % j7)
# >>> j7: ['选电影', '电视剧', '排行榜', '分类', '影评', '2019年度榜单', '2019书影音报告', '百度一下，你就知道']

# following-sibling: only later nodes that share the context node's parent.
j8 = html.xpath("//li[1][contains(@class, 'li')]/following-sibling::*/a/text()")
print("j8: %s" % j8)
# >>> j8: ['选电影', '电视剧', '排行榜', '分类', '影评', '2019年度榜单', '2019书影音报告']

# preceding: everything that ends before the context node's opening tag
# (ancestors are not included). The context node here is the third li.
j9 = html.xpath("//li[3][contains(@class, 'li')]/preceding::*")
print("j9: %s" % j9)
# >>> j9: [<Element li at 0x...>, <Element a at 0x...>, <Element li at 0x...>, <Element a at 0x...>]

# preceding-sibling: only earlier nodes that share the third li's parent.
j10 = html.xpath("//li[3][contains(@class, 'li')]/preceding-sibling::*")
print("j10: %s" % j10)
# >>> j10: [<Element li at 0x...>, <Element li at 0x...>]

# parent: the single direct parent of the context node.
j11 = html.xpath("//li[1][contains(@class, 'li')]/parent::*")
print("j11: %s" % j11)
# >>> j11: [<Element ul at 0x...>]