Getting Started with BeautifulSoup: Source Code
Video link
Environment: Python 2.7
In Chrome, open the target page, right-click and use the Inspect tool to find the class or id of the content you need.
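Before the full script, here is a minimal sketch of how a class name found with Inspect is used as a CSS selector in BeautifulSoup (the HTML snippet below is made up for illustration; .main-title is the same class the script selects later):

from bs4 import BeautifulSoup

demo_html = '<h1 class="main-title">demo title</h1>'  # hypothetical snippet standing in for a real page
demo_soup = BeautifulSoup(demo_html, "lxml")  # "html.parser" also works if lxml is not installed
print(demo_soup.select(".main-title")[0].text)  # prints: demo title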
#coding=utf-8
import requests  # HTTP requests
from bs4 import BeautifulSoup  # HTML parsing
import json  # JSON parsing
import re  # regular expressions
import time  # delays between requests
import pandas  # data analysis / tabular display
import sqlite3  # data storage
#newsurl = "http://news.sina.com.cn/china"
# Comment-count API URL template (the article's news id is filled into {})
common_js = "http://comment5.news.sina.com.cn/page/info?version=1&format=json&channel=gn&newsid=comos-{}&group=undefined&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=3&t_size=3&h_size=3&thread=1&callback=jsonp_1517920852693&_=1517920852693"
# Get the comment count for an article
def getNewsCommentCount(new_url):
    # news_id = sub_url.split("/")[-1].strip("doc-i").rstrip(".shtml")
    # print(news_id)
    news_id = re.search("doc-i(.+).shtml", new_url).group(1)  # extract the news id with a regex
    res = requests.get(common_js.format(news_id))
    response = res.text[res.text.find("(") + 1 : res.text.rfind(")")]  # slice off the jsonp_xxx(...) callback wrapper
    jd = json.loads(response)
    if jd.get("result") and jd["result"].get("count") and jd["result"]["count"].get("total"):
        return jd["result"]["count"]["total"]
    return 0
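# Usage sketch (the URL below is hypothetical; any article URL containing
# doc-i<id>.shtml works the same way):
#   print(getNewsCommentCount("http://news.sina.com.cn/c/doc-ifyrkuxt1234567.shtml"))
# The API returns jsonp_1517920852693({...}), so the wrapper is removed before
# json.loads; result.count.total is the comment count.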
# Get the article details
def getNewsDetail(sub_new_url):
    result = {}
    res = requests.get(sub_new_url)
    soup = BeautifulSoup(res.text.encode(res.encoding).decode('utf-8'), "lxml")  # "lxml" is the parser type; "html.parser" can also be used
    result["articleUrl"] = sub_new_url
    result["articleTitle"] = soup.select(".main-title")[0].text  # article title
    result["articleTime"] = soup.select(".date-source")[0].select("span")[0].text  # publish time
    result["articleContent"] = "\n".join([p.text.strip() for p in soup.select("#article p")[:-1]])  # article body; [:-1] skips the last <p>
    result["articleAuthor"] = soup.select(".show_author")[0].text  # author
    result["articleComment"] = getNewsCommentCount(sub_new_url)  # comment count
    if len(soup.select(".img_wrapper")) > 0:
        result["articlePicture"] = soup.select(".img_wrapper")[0].select("img")[0]["src"]  # first article image
    return result
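# Usage sketch (hypothetical URL): the returned dict holds articleUrl,
# articleTitle, articleTime, articleContent, articleAuthor and articleComment,
# plus articlePicture when the page contains an image.
#   detail = getNewsDetail("http://news.sina.com.cn/c/doc-ifyrkuxt1234567.shtml")
#   print(detail["articleTitle"])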
# Paginated news-list API URL template (the page number is filled into {})
#page_common_url = "http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback&_=1518067220351"
page_common_url = "http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&callback=newsloadercallback"
# Fetch one page of the news list
def getPageDataList(pageIndex):
    page_url = page_common_url.format(pageIndex)  # fill the page number into the URL
    page_url_request = requests.get(page_url)  # request the data
    page_text = page_url_request.text
    page_url_jd = json.loads(page_text[page_text.find("(") + 1 : page_text.rfind(")")])  # slice off the newsloadercallback(...) wrapper and parse
    sub_url_array = []
    for sub_item in page_url_jd["result"]["data"]:
        sub_url = sub_item["url"]
        if sub_url.startswith("http"):  # keep only URLs that start with http
            sub_url_array.append(sub_url)
    return sub_url_array  # list of article URLs on this page
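# Usage sketch: getPageDataList(1) requests page 1 of the list API and returns
# the article URLs on that page (at most show_num=22 per page, as set in
# page_common_url).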
# Get the article URLs from the first 2 pages
def getTotalNewUrlList():
    sub_url_total_array = []
    for i in range(1, 3):  # pages 1 and 2
        sub_url_total_array.extend(getPageDataList(i))  # extend adds the items one by one; append would add the whole list as a single element (illustrated just below)
    return sub_url_total_array  # list holding every article URL
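# extend vs append, a quick illustration:
#   a = [1]; a.extend([2, 3])  ->  [1, 2, 3]
#   b = [1]; b.append([2, 3])  ->  [1, [2, 3]]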
# Get the details of every article
def getTotalNewsDetail():
    total_detail_list = []
    for sub_total_url in getTotalNewUrlList():  # all article URLs gathered from the paginated list
        resultDic = getNewsDetail(sub_total_url)  # detail dict for one article
        time.sleep(1)  # crawl politely with generous delays; check the site's robots.txt for whether crawling is allowed and at what rate
        for i in resultDic:
            print(resultDic[i])  # print the article details
        total_detail_list.append(resultDic)  # add the detail dict to the list
    return total_detail_list
# Store the data with sqlite; pandas gives a clean tabular view of it
def write_data(total_news):
    df = pandas.DataFrame(total_news)  # load the data into a pandas DataFrame
    with sqlite3.connect('news.sqlite') as db:
        df.to_sql('news', con=db, if_exists='append')  # write the DataFrame to the database; 'append' lets the script run more than once
# Read the data back
def read_data():
    with sqlite3.connect('news.sqlite') as db:
        df = pandas.read_sql_query('select * from news', con=db)  # read the table back out of the database
        print(df)
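# Note: with if_exists='append', every run adds the scraped rows again; use
# if_exists='replace' instead if the news table should be rebuilt each run.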
# Entry point
def main():
    total_news = getTotalNewsDetail()  # list of article detail dicts
    write_data(total_news)  # write to the database
    read_data()  # read back from the database and print
if __name__ == '__main__':
    main()