第一种:正则表达式
import requests

# Query-string parameters that requests will URL-encode onto the search URL.
search_params = {"wd": "杨幂"}
base_url = "https://www.baidu.com/s?"
# Minimal browser-like headers so the server does not reject the client.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
}

response = requests.get(url=base_url, headers=request_headers, params=search_params)
# Headers that were actually sent with the outgoing request.
print(response.request.headers)
# Encoding that requests guessed for the response body.
print(response.encoding)
response = response.content
第二种:xpath ;
1.先安装:
pip3 install lxml
2.导入包:
import requests
from lxml import etree
import json
class QiushiSpider():
    """Scrape joke posts from qiushibaike.com via lxml/XPath and append them
    to ``qiushi.json`` as JSON lines."""

    def __init__(self):
        # Current page number; start() appends it to the base URL.
        self.pg = 1
        self.url = "https://www.qiushibaike.com/text/"
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
        }

    def locapage(self, url):
        """Download one listing page and save every post found on it.

        url: full URL of the page to fetch.
        """
        response = requests.get(url=url, headers=self.headers)
        html = response.content.decode('utf-8')
        content = etree.HTML(html)
        # Each direct child <div> of div.col1 is one post.
        parent = content.xpath('//div[@class="col1"]/div')
        for son in parent:
            name = son.xpath(".//h2/text()")[0]
            contents = '.'.join(son.xpath('.//div[@class="content"]//span/text()'))
            smile = son.xpath('.//div[@class="stats"]/span/i/text()')[0]
            comments = son.xpath('.//div[@class="stats"]/span/a/i/text()')[0]
            # Renamed from `dict`: never shadow the builtin.
            item = {
                'name': name.strip(),
                'content': contents.strip(),
                'smile': smile,
                'comments': comments,
            }
            self.save(item)
        # if self.pg < int(page):
        #     self.pg += 1
        #     self.start()

    def save(self, item):
        """Append one record to qiushi.json as a single JSON line."""
        # encoding is explicit: ensure_ascii=False emits non-ASCII characters,
        # which would crash on platforms whose default encoding is not UTF-8.
        with open("qiushi.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    def start(self):
        """Build the URL for the current page number and scrape it."""
        full_url = self.url + str(self.pg) + "/"
        self.locapage(full_url)
if __name__ == "__main__":
qs = QiushiSpider()
qs.start()
第三种:bs4(BeautifulSoup),使用选择器获取节点对象
import requests
from bs4 import BeautifulSoup
import json
def locapage():
    """Fetch the first Tencent job-listing page and save each posting via save().

    Parses the results table with BeautifulSoup; data rows alternate between
    the CSS classes "even" and "odd".
    """
    url = 'https://hr.tencent.com/position.php?&start=0'
    headers = {
        "User-Agent": 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
    }
    response = requests.get(url=url, headers=headers)
    html = response.content.decode('utf-8')
    bs = BeautifulSoup(html, 'lxml')
    tr = bs.select('tr[class="even"]')
    tr1 = bs.select('.odd')
    # tr = bs.find_all('tr', class_="even")
    # tr1 = bs.find_all('tr', class_="odd")
    result = tr + tr1
    for job in result:
        # Hoist the cell list: one select() call instead of four per row.
        cells = job.select('td')
        # Renamed from `dict` (and locals `type`/`time`) to avoid shadowing builtins.
        record = {
            "name": job.select('td a')[0].get_text(),
            "type": cells[1].get_text(),
            "num": cells[2].get_text(),
            "address": cells[3].get_text(),
            "time": cells[4].get_text(),
        }
        save(record)
def save(content):
    """Append one record to job.json as a single JSON line.

    content: JSON-serializable mapping describing one job posting.
    """
    # encoding is explicit: ensure_ascii=False writes non-ASCII characters,
    # which would raise UnicodeEncodeError under a non-UTF-8 default encoding.
    with open('job.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + "\n")
if __name__ == '__main__':
    # Kick off the scrape when run directly.
    locapage()