# 这次是分析网页获得后台API的接口网址,然后直接进行爬取。随着学习深入,可将爬取内容直接写入数据库。
# (Analyze the page to find the backend API endpoint, then crawl it directly;
# later the crawled data can be written straight into a database.)
import requests
def getJobDetail(jobinfo):
    """Format one job record from the Baidu Zhaopin API into a summary line.

    Parameters
    ----------
    jobinfo : dict
        One entry of the API's ``disp_data`` list; the keys used are
        ``title``, ``source``, ``officialname``, ``ori_education``,
        ``experience`` and ``ori_salary``.

    Returns
    -------
    str
        A single " | "-separated line describing the job.
    """
    # Use .get() with an 'N/A' fallback so one record missing a field does
    # not abort the whole crawl with a KeyError.
    jobname = jobinfo.get('title', 'N/A')
    source = jobinfo.get('source', 'N/A')
    compname = jobinfo.get('officialname', 'N/A')
    education = jobinfo.get('ori_education', 'N/A')
    experience = jobinfo.get('experience', 'N/A')
    salary = jobinfo.get('ori_salary', 'N/A')
    return "{jn} | 学历要求:{edu} | 经验要求:{exp} | 薪资:{salary} | 公司名:{cn} | 来源:{src} ".format(
        jn=jobname, edu=education, exp=experience, salary=salary, cn=compname, src=source)
# Endpoint of Baidu Zhaopin's full-time-job search AJAX API.
baiduAPI = 'http://zhaopin.baidu.com/api/quanzhiasync'
keyword = input('请输入关键字: ')
city = input('请输入城市: ')
# Mimic the browser's AJAX request; 'X-Requested-With' is what marks the
# request as XHR so the backend returns JSON instead of a rendered page.
headerAPI = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
            'X-Requested-With':'XMLHttpRequest',
            'Host':'zhaopin.baidu.com',
            'Accept-Language':'zh-CN,zh;q=0.8'
            }
# Query parameters: rn = page size, pn = item offset; both are mutated
# below while paginating.
parmsAPI ={'query':keyword,
           'sort_type':1,
           'city':city,
           'detailmode':'close',
           'rn':20,
           'pn':0}
# The with-block guarantees the session's connection pool is released even
# if a request raises (the original s.close() was skipped on exceptions).
with requests.Session() as s:
    s.headers.update(headerAPI)
    # timeout keeps a stalled server from hanging the script forever.
    resp = s.get(baiduAPI, params=parmsAPI, timeout=10)
    resp.raise_for_status()  # fail fast on HTTP errors instead of a JSON decode error
    content = resp.json()
    resultset = content['data']['main']['data']
    total_job_cnt = int(resultset['dispNum'])
    # Fewer jobs than one page: shrink the page size to the total.
    if parmsAPI['rn'] >= total_job_cnt:
        parmsAPI['rn'] = total_job_cnt
    print(resultset['dispNum'], resultset['listNum'])
    print("total {} jobs printed".format(total_job_cnt))
    with open('zhaopin.txt', 'w', encoding='utf-8') as f:
        # total_job_cnt counts down as pages are written; rn shrinks to the
        # remainder on the last partial page.
        while total_job_cnt > 0:
            joblist = content['data']['main']['data']['disp_data']
            for jobinfo in joblist:
                f.write(getJobDetail(jobinfo) + '\n')
            parmsAPI['pn'] += parmsAPI['rn']
            # Last page just written: page size equals the remaining count.
            if parmsAPI['rn'] == total_job_cnt:
                break
            total_job_cnt -= parmsAPI['rn']
            # Next fetch is the final partial page: request only what is left.
            if total_job_cnt <= parmsAPI['rn']:
                parmsAPI['rn'] = total_job_cnt
            resp = s.get(baiduAPI, params=parmsAPI, timeout=10)
            resp.raise_for_status()
            content = resp.json()