# -*- coding: utf-8 -*-
'''
Parses the home page of Jianshu (jianshu.com).
'''
from bs4 import BeautifulSoup
import re
# Site root; the href / data-url values read from the page are appended to it.
url = 'http://www.jianshu.com'
# Matches runs of decimal digits; used to pull the counters out of element text.
pattern = re.compile(r'\d+')
def parse(html):
    '''
    Parse the Jianshu home page markup and extract per-article fields.

    For every <h4> found inside the <ul class="article-list"> element this
    reads the title and article link, the author's name and profile link,
    the ``data-shared-at`` attribute of the first <span> in the author
    paragraph, and four numeric counters taken from the first <div> under
    the heading's parent. Finally it reads the ``data-url`` attribute of
    the <button class="ladda-button"> element to build the URL of the next
    batch.

    NOTE(review): every extracted value is only bound to a local name;
    nothing is returned, stored or printed, so callers currently receive
    None on every path. Confirm with callers what shape the results should
    take before exposing them.

    :param html: markup to parse; None makes the function return at once.
    :return: None.
    :raises AttributeError, TypeError, KeyError, IndexError: when an
        article entry lacks one of the expected elements, attributes or
        digits.
    '''
    if html is None:
        return None
    soup = BeautifulSoup(html, 'html5lib')

    # Locate the <h4> headings inside the article list.
    ul = soup.find('ul', attrs={'class': 'article-list'})
    # find() yields None when the list is missing; treat that as "no articles"
    # instead of failing on None.find_all.
    h4list = ul.find_all('h4') if ul is not None else []
    for h4 in h4list:
        # Title and article link.
        a = h4.find('a')
        title = a.text
        article_url = url + a['href']
        div = h4.parent
        # Author information.
        p = div.find('p')
        usera = p.find('a')
        user_url = url + usera['href']
        username = usera.text
        # Timestamp attribute.
        span_time = p.find('span')['data-shared-at']
        # Counters: reads, comments, likes, rewards.
        footer = div.find('div')
        a_list = footer.find_all('a')
        read_num = pattern.findall(a_list[0].text)[0]
        comment_num = pattern.findall(a_list[1].text)[0]
        span_list = footer.find_all('span')
        like_num = pattern.findall(span_list[0].text)[0]
        ds_num = pattern.findall(span_list[1].text)[0]

    # URL used to load more articles.
    loadmore = soup.find('button', attrs={'class': 'ladda-button'})
    # No such button on the page means there is no next URL to build.
    nexturl = url + loadmore['data-url'] if loadmore is not None else None