First, write a few helper functions and save them in a file named getZhihuInfo.py:
import requests
from bs4 import BeautifulSoup
import json
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Cookie': '#  your own cookie here',
    'Host': 'www.zhihu.com',
    'Referer': 'https://www.zhihu.com/people',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
}
headers_post = {
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Content-Length': '16',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer': 'https://www.zhihu.com/people',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
    'Cookie': '#  your own cookie here',
    'Host': 'www.zhihu.com',
    'Origin': 'https://www.zhihu.com',
    'X-Requested-With': 'XMLHttpRequest',
    'X-Xsrftoken': '82f9b2f5e3166156c04eeb491ac6f21e'
}
# Return the text of the first matched element, or 'Unknown' if the selection is empty
def setValue(soupS):
    if soupS:
        return soupS[0].get_text()
    else:
        return 'Unknown'
# Fetch the basic profile info for one user URL; returns a list
def getBasicInfo(peopleUrl):
    wb_data = requests.get(peopleUrl, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    name = soup.select('div.title-section > span')[0].get_text()
    alocation = soup.select('span.location.item')
    abusiness = soup.select('span.business.item')
    agender = soup.select('span.item.gender > i')
    aemployment = soup.select('span.employment.item')
    aposition = soup.select('span.position.item')
    aeducation = soup.select('span.education.item')
    aeducation_extra = soup.select('span.education-extra.item')
    location = setValue(alocation)
    business = setValue(abusiness)
    # Gender is encoded in the icon's class name, e.g. 'icon-profile-male'
    if agender:
        gender = agender[0].get('class')[1][13:]
    else:
        gender = 'Unknown'
    employment = setValue(aemployment)
    position = setValue(aposition)
    education = setValue(aeducation)
    education_extra = setValue(aeducation_extra)
    agree = soup.select('span.zm-profile-header-user-agree > strong')[0].get_text()
    thanks = soup.select('span.zm-profile-header-user-thanks > strong')[0].get_text()
    # Counters for asks / answers / posts / collections / logs
    action5 = soup.select('span.num')
    asks = action5[0].get_text()
    answers = action5[1].get_text()
    posts = action5[2].get_text()
    if len(action5) > 3:
        collections = action5[3].get_text()
        logs = action5[4].get_text()
    else:
        collections = 'Null'
        logs = 'Null'
    followees = soup.select('a.item > strong')[0].get_text()
    followers = soup.select('a.item > strong')[1].get_text()
    # Followed columns (zl) and topics (ht); some profiles only show topics
    focus2 = soup.select('div.zm-profile-side-section-title > a > strong')
    if len(focus2) == 2:
        zl = focus2[0].get_text()[:-3]
        ht = focus2[1].get_text()[:-3]
    else:
        ht = focus2[0].get_text()[:-3]
        zl = '0'
    basicInfoSet = [name, location, business, gender, employment, position, education,
                    education_extra, agree, thanks, asks, answers, posts, collections,
                    logs, followees, followers, zl, ht]
    return basicInfoSet
# Fetch the profile URLs of the users someone follows; returns a list
def getFolloweesUrl(OneUrl):
    url = OneUrl + '/followees'
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    alist = soup.select('a.zg-link.author-link')
    followeeUrlSet = []
    if alist:
        for i in alist:
            followeeUrlSet.append(i.get('href'))
    # print(len(followeeUrlSet))
    return followeeUrlSet
# Fetch a user's 20 most recent activities before a given date; returns a dict
def postActivitiesByDate(Purl, byDate):
    url = Purl + '/activities'
    data = {
        'start': byDate
    }
    wb_data = requests.post(url, headers=headers_post, data=data)
    # print(wb_data)
    # The JSON response carries the rendered HTML fragment in msg[1]
    soup = BeautifulSoup(wb_data.json()['msg'][1], 'lxml')
    activities = soup.select('div.zm-profile-section-item.zm-item.clearfix')
    actdata = {}
    for i in activities:
        actdata[i.get('data-time')] = i.get('data-type-detail')
    return actdata
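With the helpers in place, a quick smoke test from the same directory looks roughly like this (a minimal sketch; the profile URL is a made-up placeholder, and it assumes the Cookie in the headers above is valid):
# Smoke test for getZhihuInfo.py -- substitute any real Zhihu profile URL
from getZhihuInfo import getBasicInfo, getFolloweesUrl, postActivitiesByDate

peopleUrl = 'https://www.zhihu.com/people/some-user'    # hypothetical placeholder
print(getBasicInfo(peopleUrl))                           # 19-element list of profile fields
print(getFolloweesUrl(peopleUrl))                        # list of followee profile URLs
print(postActivitiesByDate(peopleUrl, 1473379200))       # dict: timestamp -> activity type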
Next, collect a batch of Zhihu profile URLs (each user's followees) and store them in MongoDB:
from getZhihuInfo import getFolloweesUrl
import pymongo
client = pymongo.MongoClient('localhost',27017)
zhiHu = client['zhiHu']
zhiHuId = zhiHu['zhiHuId']
# Seed URL: put the URL of one profile page here first
urlSet = ['']
# Initial setup (run once to seed the collection)
# zhiHuId.remove()
# fd1 = {
#     'id': 0,
#     'followees': urlSet
# }
# zhiHuId.insert_one(fd1)
begin = 0   # start from 0
end = 1000
dbId = 0    # largest id value so far
for k in range(begin, end):
    for i in zhiHuId.find_one({'id': k})['followees']:
        followees = getFolloweesUrl(i)
        dbId += 1
        fd = {
            'id': dbId,
            'followees': followees
        }
        zhiHuId.insert_one(fd)
        print(dbId)
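Before moving on it is worth checking how many distinct profile URLs were actually collected, since different users often follow the same people. A minimal sketch, assuming the zhiHuId layout created above:
# Count total vs. distinct profile URLs stored in zhiHuId (sanity check only)
import pymongo

client = pymongo.MongoClient('localhost', 27017)
zhiHuId = client['zhiHu']['zhiHuId']

allUrls = []
for doc in zhiHuId.find():
    allUrls.extend(doc.get('followees') or [])
print(len(allUrls), 'URLs collected,', len(set(allUrls)), 'distinct')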
Then pull each URL out of the database, fetch its profile data and activities, and store everything in another collection:
from getZhihuInfo import setValue,getBasicInfo,postActivitiesByDate
import pymongo
import time
client = pymongo.MongoClient('localhost',27017)
zhiHu = client['zhiHu']
zhiHuId = zhiHu['zhiHuId']
zhiHuDetail = zhiHu['zhiHuDetail']
# Seed URL (not used below)
OneUrl = ''
# Cutoff date for the activity fetch: Unix timestamp for 2016-09-09 (00:00 UTC)
byDate160909 = 1473379200
begin = 0   # start from 0
end = 1000
count = 0
for k in range(begin, end):
    x = zhiHuId.find_one({'id': k})['followees']
    if x:
        for i in x:
            y = getBasicInfo(i)
            z = postActivitiesByDate(i, byDate160909)
            oneData = {
                'name': y[0], 'location': y[1], 'business': y[2],
                'gender': y[3], 'employment': y[4], 'position': y[5], 'education': y[6],
                'education_extra': y[7], 'agree': y[8], 'thanks': y[9], 'asks': y[10],
                'answers': y[11], 'posts': y[12], 'collections': y[13], 'logs': y[14],
                'followees': y[15], 'followers': y[16], 'zl': y[17], 'ht': y[18],
                'activities': z
            }
            zhiHuDetail.insert_one(oneData)
            count += 1
            print(k, '----', count)
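Once the crawl finishes, the records can be read straight back out of MongoDB for analysis. A minimal sketch of a first look at the results, assuming the zhiHuDetail layout used above:
# Quick look at the collected data: record count and a gender breakdown
import pymongo

client = pymongo.MongoClient('localhost', 27017)
zhiHuDetail = client['zhiHu']['zhiHuDetail']

# Group by the 'gender' field stored by the crawler ('male', 'female', or 'Unknown')
total = 0
for row in zhiHuDetail.aggregate([{'$group': {'_id': '$gender', 'n': {'$sum': 1}}}]):
    print(row['_id'], row['n'])
    total += row['n']
print('records:', total)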