import requests
from lxml import etree
import re
def get_cate_info(url):
    """Fetch the site front page and crawl every movie category found on it.

    url: front-page URL of www.ygdy8.com.
    Side effect: calls get_movie_url() once per category link.
    """
    res = requests.get(url)
    # Site serves legacy Chinese encoding; requests guesses wrong without this.
    res.encoding = 'gb2312'
    selector = etree.HTML(res.text)
    # Category links live in the top navigation bar.
    infos = selector.xpath('//div[@class="contain"]/ul/li/a')
    # The last four <a> tags in the nav bar are not categories (bookmark /
    # set-homepage / game-download / classics links) — drop them with one
    # slice delete instead of four separate `del infos[-1]` statements.
    del infos[-4:]
    for info in infos:
        name = info.xpath('text()')[0]
        # Use a fresh name instead of shadowing the `url` parameter.
        cate_url = 'http://www.ygdy8.com' + info.xpath('@href')[0]
        get_movie_url(cate_url, name)
def get_movie_url(url, cate_name):
    """Walk every listing page of one category and crawl each movie on it.

    url: category index-page URL (e.g. .../html/gndy/dyzz/index.html).
    cate_name: human-readable category name, passed through for printing.
    Side effect: calls get_movie_source() once per movie link.
    """
    res = requests.get(url)
    res.encoding = 'gb2312'
    # Total page count, e.g. "共163页" -> ['163'].
    all_page = re.findall('共(.*?)页', res.text)
    # Listing-page filename prefix, e.g. 'list_23_', scraped from the pager's
    # <option value='list_23_2.html'> entries.
    kind = re.findall('<option value=\'(list_.*?_).*?', res.text)
    if len(all_page) > 0:
        # BUG FIX: the original used url.rstrip(url.split('/')[-1]).
        # str.rstrip() strips a *character set*, not a suffix, so it can eat
        # extra trailing characters of the path. Cut off the last path
        # component explicitly instead.
        kind_url = url.rsplit('/', 1)[0] + '/' + str(kind[0])
        for page in range(1, int(all_page[0]) + 1):  # every listing page
            page_url = kind_url + str(page) + '.html'
            resp = requests.get(page_url)
            resp.encoding = 'gb2312'
            selector = etree.HTML(resp.text)
            # One <a class="ulink"> per movie in the listing table.
            infos = selector.xpath('//table[@class="tbspan"]//a[@class="ulink"]')
            for info in infos:
                moview_url = 'http://www.ygdy8.com' + info.xpath('@href')[0]
                moview_name = info.xpath('text()')[0]
                get_movie_source(moview_url, url, cate_name, moview_name)
def get_movie_source(moview_url, cate_url, cate_name, moview_name):
    """Fetch one movie's detail page and print its Thunder download link.

    moview_url: movie detail-page URL.
    cate_url: category page URL (unused here; kept for interface compatibility).
    cate_name / moview_name: labels echoed in the printed report.
    """
    new_res = requests.get(moview_url)
    new_res.encoding = 'gb2312'
    new_select = etree.HTML(new_res.text)
    # The download link is the first anchor text inside the page's table body.
    # BUG FIX: the original indexed [0] unconditionally and raised IndexError
    # on detail pages with no download table; skip such pages gracefully.
    sources = new_select.xpath('//tbody//tr/td/a/text()')
    if not sources:
        return
    print(" 类别:{} \n 名字:{} \n 地址:{} \n 迅雷下载地址:{} \n".format(cate_name, moview_name, moview_url, sources[0]))
if __name__ == '__main__':
    # Entry point: start the crawl from the site's front page.
    start_url = "http://www.ygdy8.com/"
    get_cate_info(start_url)
# NOTE: get_cate_info() deletes the last 4 elements because the nav list also
# contains the "收藏本站" (bookmark), "加入主页" (set homepage), "游戏下载"
# (game downloads) and "高分经典" (top-rated classics) links, which are not
# movie categories — so those four are removed.