"""
Author: soliton
Email : soliton.wang@gmail.com
"""
import time
import asyncio
import requests
from lxml import etree
from pymysql import connect
from pyppeteer.launcher import launch
# --- Module-level setup --------------------------------------------------
# MySQL connection shared by get_content_insert_mysql(); host/user/password
# are placeholders and must be filled in before running.
mysqldb = connect(host='host', port=3306, database='database', user='user', password='password', charset='utf8')
# Cursor used for all inserts; closed in the __main__ block.
cs = mysqldb.cursor()
# Labels stored with every scraped row: the site name ("西瓜视频" = Xigua
# Video) and the channel/author being scraped.
name = "西瓜视频"
author = '燃新闻视频'
# Headers for the plain-requests fetches of individual video pages.
# NOTE(review): "KHTHL" looks like a typo for "KHTML" in the UA string —
# almost certainly harmless (servers rarely parse that token), but confirm
# before changing it.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTHL, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
"referer": "https://www.ixigua.com/"
}
# Accumulator filled by Get_News_url() and consumed in __main__.
video_news_url =[]
async def Get_News_url(url):
    """Open *url* in headless Chromium, scroll to trigger the infinite-scroll
    feed, and append absolute video-page URLs to the module-level
    ``video_news_url`` list.

    Parameters
    ----------
    url : str
        An ixigua.com author home page, e.g.
        ``https://www.ixigua.com/home/6886776520``.

    Side effects
    ------------
    Appends to the global ``video_news_url``; launches and closes a
    Chromium process.
    """
    params = {
        "headless": True,   # run without a visible browser window
        'dumpio': 'True',   # forward browser stdio; avoids pipe-buffer hangs
        "args": [
            '--no-sandbox',
            '--disable-infobars',  # hide the "controlled by automation" banner
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        ],
    }
    browser = await launch(**params)
    try:
        pages = await browser.pages()
        page = pages[0]
        await page.setViewport({'width': 900, 'height': 768})
        await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTHL, like Gecko) Chrome/66.0.3359.181 Safari/537.36")
        await page.goto(url)
        # Scroll the page in steps so the lazy-loading feed fetches more items.
        # BUG FIX: the original built the scroll JS string but never executed
        # it, so the page never scrolled and only the first screen of links
        # could ever be collected.
        for i in range(0, 10):
            js = "var q=document.documentElement.scrollTop={}".format(i * 1000)
            await page.evaluate(js)
            await page.waitFor(300)  # give lazy-loaded content time to render
        content = await page.content()
        await page.close()
    finally:
        # BUG FIX: close the whole browser, not just the tab, so the Chromium
        # process does not leak on every call (or on an exception above).
        await browser.close()
    tree = etree.HTML(content)
    # Relative hrefs of the video entries in the author's feed.
    # NOTE(review): this absolute XPath is tied to the page layout at the time
    # of writing and will silently return [] if ixigua changes its markup.
    urls = tree.xpath('//*[@id="App"]/div/div[3]/div/div[2]/div/div[2]/div[1]//div[2]/a/@href')
    for href in urls:
        video_news_url.append("https://www.ixigua.com" + href)
def get_content_insert_mysql(video_news_url):
    """Fetch each video page in *video_news_url*, extract its metadata and
    insert one row per video into the ``jt_spider`` table.

    Parameters
    ----------
    video_news_url : list[str]
        Absolute URLs of ixigua video pages (built by ``Get_News_url``).

    Side effects
    ------------
    HTTP GETs each URL, executes INSERTs on the module-level cursor ``cs``
    and commits on ``mysqldb``; sleeps 3 s between pages to be polite.
    """
    for url in video_news_url:
        # BUG FIX: added a timeout — the original request could hang forever
        # on a stalled connection.
        resp = requests.get(url=url, headers=headers, timeout=15).content.decode('utf-8')
        tree = etree.HTML(resp)
        # Absolute XPaths tied to the current page layout; each returns ''
        # (joined empty list) when the markup changes.
        title = ''.join(tree.xpath('//*[@id="App"]/div/div[2]/div/div[3]/div[2]/div[1]/div/h1/text()'))
        play_view = ''.join(tree.xpath('//*[@id="App"]/div/div[2]/div/div[3]/div[2]/div[2]/div[1]/span[1]/text()'))
        publish = ''.join(tree.xpath('//*[@id="App"]/div/div[2]/div/div[3]/div[2]/div[2]/div[1]/span[3]/@data-publish-time'))
        if not publish:
            # BUG FIX: the original crashed with ValueError on int('') when
            # the publish-time attribute was missing (layout change / block
            # page). Skip the page instead of aborting the whole run.
            continue
        # data-publish-time is a unix timestamp; format it in local time.
        publish_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(publish)))
        content = ''.join(tree.xpath('//*[@id="App"]/div/div[2]/div/div[3]/div[2]/div[2]/div[2]/div[1]/text()'))
        now_time = time.strftime("%Y-%m-%d %H:%M:%S")
        # INSERT IGNORE so re-runs do not fail on duplicate rows.
        query = "insert ignore into jt_spider(class, title, source, source_url, comments_count, abstract, behot_time, nowtime) value (%s, %s, %s, %s, %s, %s, %s, %s)"
        values = (name, title, author, url, play_view, content, publish_time, now_time)
        print(values)
        cs.execute(query, values)
        mysqldb.commit()
        time.sleep(3)  # throttle requests so the site is not hammered
if __name__ == '__main__':
    # Author home page to scrape (channel id is part of the URL).
    news_url = 'https://www.ixigua.com/home/6886776520'
    # Phase 1: collect video-page URLs with headless Chromium (async).
    asyncio.get_event_loop().run_until_complete(Get_News_url(url=news_url))
    # Phase 2: fetch each page with requests and store rows in MySQL.
    get_content_insert_mysql(video_news_url)
    cs.close()
    # BUG FIX: also close the connection — the original closed only the
    # cursor and leaked the MySQL connection.
    mysqldb.close()
# NOTE: the lines below are non-code residue accidentally pasted in from the
# web page this script was copied from (article title, copyright notice and
# unrelated story snippets). They are not Python — left uncommented they are
# a SyntaxError — so they are preserved here as comments only.
# Python爬取西瓜视频相关信息
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...