# python + selenium 进行浏览器模拟爬取新浪国际新闻板块
# -*- coding: utf-8 -*-
"""
author : soliton/念旧
Email : soliton.wang@gmail.com
QQ : 1670829014
Spider_News: 新浪新闻国际页面
"""
from selenium import webdriver
from lxml import etree
from lxml import cssselect
from pymysql import connect
import re
import time
# MySQL connection — host/database/user/password are placeholders; replace them
# with real credentials before running.  charset='utf8' so Chinese text round-trips.
mysqldb = connect(host='localhost', port=3306, database='database', user='username', password='password', charset='utf8')
cs = mysqldb.cursor()
# Landing page of Sina's international ("world") news section.
New_site = "https://news.sina.com.cn/world/"
News_url = []  # article URLs collected by the scroll/scrape phase below
News_name = '新浪新闻'  # site label stored in the `class` column of the toutiao table
"""
获取国际新闻的版块单个新闻链接存到“News_url”列表里
"""
# Open the section page in Chrome and scroll the viewport down repeatedly so the
# lazy-loaded news feed renders as many entries as possible before parsing.
browser = webdriver.Chrome()
browser.get(New_site)
for i in range(1000):
    # Each step jumps the scroll position down by another 1000px to trigger
    # the page's infinite-scroll loading.
    browser.execute_script("var q=document.documentElement.scrollTop={}".format(i * 1000))
# Parse the fully-scrolled page once.
html = etree.HTML(browser.page_source)
# The feed is split across four containers (subShowContent1_news1..news4);
# collect every headline link from each of them.  (Replaces four copy-pasted
# xpath/append stanzas with one loop.)
for section in range(1, 5):
    links = html.xpath('//*[@id="subShowContent1_news{}"]//h2/a/@href'.format(section))
    News_url.extend(links)
"""
循环处理列表里的链接进行访问,并获取需要的相关信息
新闻链接
标题
发布人
发布时间
发布正文内容
"""
# Visit every collected article URL, extract title / author / publish time /
# body text, and insert one row per article into the `toutiao` table.
# The SQL is loop-invariant, so hoist it; INSERT IGNORE makes re-runs skip
# rows that violate the table's unique key instead of erroring.
query = "insert ignore into toutiao(class,title,source,source_url,behot_time,nowtime,abstract) values (%s, %s, %s, %s, %s, %s, %s)"
try:
    for url in News_url:
        browser.get(url)
        article_html = etree.HTML(browser.page_source)
        # Iterate the matched <h1> elements directly instead of re-running the
        # CSS selector and indexing with range(len(...)).
        for heading in article_html.cssselect('div.main-content.w1240 > h1'):
            # heading.text is None when the element has no direct text node;
            # guard so we store '' instead of crashing.
            article_title = heading.text or ''
            article_behot_time = ''.join(article_html.xpath('//*[@id="top_bar"]/div/div[2]/span/text()'))
            article_author = ''.join(article_html.xpath('//*[@id="top_bar"]/div/div[2]/a/text()'))
            paragraphs = ''.join(article_html.xpath('//*[@class="article"]//p/text()'))
            # Strip every whitespace character from the body.  Raw string fixes
            # the invalid '\s' escape in the original pattern.
            article_content = re.sub(r'\s', '', paragraphs)
            now_time = time.strftime("%Y-%m-%d %H:%M:%S")
            values = (News_name, article_title, article_author, url, article_behot_time, now_time, article_content)
            cs.execute(query, values)
            mysqldb.commit()
        time.sleep(2)  # be polite: pause between article requests
finally:
    # Release the cursor and the browser even if a page errors out mid-loop.
    cs.close()
    browser.close()
"""
article_title 标题 div.main-content.w1240 > h1
behot_time 发布时间 //*[@id="top_bar"]/div/div[2]/span/text()
article_author 发布人 //*[@id="top_bar"]/div/div[2]/a/text()
article_content 正文内容 //*[@id="article"]
news_url 新闻链接
site_name 新闻站点名称
"""