# python + selenium 进行浏览器模拟爬取新浪国际新闻板块
# -*- coding: utf-8 -*-
"""
author : soliton/念旧
Email : soliton.wang@gmail.com
QQ : 1670829014
Spider_News: 新浪新闻国际页面
"""
from selenium import webdriver
from lxml import etree
from lxml import cssselect
from pymysql import connect
import re
import time
# MySQL connection — host/database/user/password are placeholders; replace them
# with real credentials before running.  charset='utf8' so Chinese text round-trips.
mysqldb = connect(host='localhost', port=3306, database='database', user='username', password='password', charset='utf8')
cs = mysqldb.cursor()
# Landing page of Sina's international ("world") news section.
New_site = "https://news.sina.com.cn/world/"
News_url = []  # article URLs collected by the scroll/scrape phase below
News_name = '新浪新闻'  # site label stored in the `class` column of the toutiao table
"""
获取国际新闻的版块单个新闻链接存到“News_url”列表里
"""
# Open the section page in Chrome and scroll the viewport down repeatedly so the
# lazy-loaded news feed renders as many entries as possible before parsing.
browser = webdriver.Chrome()
browser.get(New_site)
for i in range(1000):
    # Each step jumps the scroll position down by another 1000px to trigger
    # the page's infinite-scroll loading.
    browser.execute_script("var q=document.documentElement.scrollTop={}".format(i * 1000))
# Parse the fully-scrolled page once.
html = etree.HTML(browser.page_source)
# The feed is split across four containers (subShowContent1_news1..news4);
# collect every headline link from each of them.  (Replaces four copy-pasted
# xpath/append stanzas with one loop.)
for section in range(1, 5):
    links = html.xpath('//*[@id="subShowContent1_news{}"]//h2/a/@href'.format(section))
    News_url.extend(links)
"""
循环处理列表里的链接进行访问,并获取需要的相关信息
新闻链接
标题
发布人
发布时间
发布正文内容
"""
# Visit every collected article URL, extract title / author / publish time /
# body text, and insert one row per article into the `toutiao` table.
# The SQL is loop-invariant, so hoist it; INSERT IGNORE makes re-runs skip
# rows that violate the table's unique key instead of erroring.
query = "insert ignore into toutiao(class,title,source,source_url,behot_time,nowtime,abstract) values (%s, %s, %s, %s, %s, %s, %s)"
try:
    for url in News_url:
        browser.get(url)
        article_html = etree.HTML(browser.page_source)
        # Iterate the matched <h1> elements directly instead of re-running the
        # CSS selector and indexing with range(len(...)).
        for heading in article_html.cssselect('div.main-content.w1240 > h1'):
            # heading.text is None when the element has no direct text node;
            # guard so we store '' instead of crashing.
            article_title = heading.text or ''
            article_behot_time = ''.join(article_html.xpath('//*[@id="top_bar"]/div/div[2]/span/text()'))
            article_author = ''.join(article_html.xpath('//*[@id="top_bar"]/div/div[2]/a/text()'))
            paragraphs = ''.join(article_html.xpath('//*[@class="article"]//p/text()'))
            # Strip every whitespace character from the body.  Raw string fixes
            # the invalid '\s' escape in the original pattern.
            article_content = re.sub(r'\s', '', paragraphs)
            now_time = time.strftime("%Y-%m-%d %H:%M:%S")
            values = (News_name, article_title, article_author, url, article_behot_time, now_time, article_content)
            cs.execute(query, values)
            mysqldb.commit()
        time.sleep(2)  # be polite: pause between article requests
finally:
    # Release the cursor and the browser even if a page errors out mid-loop.
    cs.close()
    browser.close()
"""
article_title 标题 div.main-content.w1240 > h1
behot_time 发布时间 //*[@id="top_bar"]/div/div[2]/span/text()
article_author 发布人 //*[@id="top_bar"]/div/div[2]/a/text()
article_content 正文内容 //*[@id="article"]
news_url 新闻链接
site_name 新闻站点名称
"""