本次目标是爬取客路(Klook)网的商品信息,并将其存储到 MongoDB 中。
源代码
import requests
import pymongo
import re
import json
import pandas as pd
import time
import random
def getheaders():
    """Return one User-Agent string picked at random from a fixed list.

    Returns:
        str: a browser User-Agent header value chosen with ``random.choice``.
    """
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    ]
    return random.choice(user_agent_list)
# Request headers sent with every page fetch.
# NOTE(review): the cookie looks like a captured browser session (it includes
# JSESSIONID and a CSRF token) and will presumably expire; consider loading it
# from configuration instead of hard-coding it here.
# The "user-agent" value is chosen once, when this dict is built.
headers = {
    # The cookie is written as two adjacent string literals, which Python
    # concatenates at compile time; a single literal may not span lines.
    "cookie": (
        "abtest_revamp=1563875646033; device_id_new=ryEGX8eZpJ00300000000000005B8Gc9qXKS00314982965WpYWiKzBGKGAWkn1DGD5S16Goh5Mk004Kht7irbDUr00000YVxEr0000041IK5K68xk78dzoTmemq:40::f9905f43e6590003; _gcl_au=1.1.31383824.1563875650; tag_fok=1563875648000; _ga=GA1.2.1742545335.1563875651; _pxvid=e628e4c7-ad2f-11e9-bcd0-0242ac12000e; klk_lang=zh-CN; __stripe_mid=beb6214f-fedc-4ec7-bd1e-8a336755b064; _gcl_aw=GCL.1563934680.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; _gac_UA-54803406-1=1.1563934680.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; _gac_UA-86696233-1=1.1563934685.EAIaIQobChMIoc6V0L7M4wIVCa6WCh2V5QA6EAAYASAAEgJJSPD_BwE; klk_currency=CNY; _gid=GA1.2.389333073.1565751780; px-abgroup=A; px-abper=100; webp_support=1; retina_support=0; JSESSIONID=280B2DDE101FA68E5B8D0A2BA2695BC0; CSRF-Token=MTU2NTkzNTA5NXxOREFWd2tQeldtTERCWXZaTW9ucjdJTXJXR05Xc1drQ3w47gIhUzoBKj9lGBxwUkIl6sSj0_z_cw8tPAMy6kA9bw==; CSRF-Token-Valid=valid; mp_c2ca8b423fd75a10792debf44cd6b51a_mixpanel=%7B%22distinct_id%22%3A%20%2216c1e409f9f2cc-08e5cb5145a7c4-37607c04-13c680-16c1e409fa031c%22%2C%22%24device_id%22%3A%20%2216c1e409f9f2cc-08e5cb5145a7c4-37607c04-13c680-16c1e409fa031c%22%2C%22%24search_engine%22%3A%20%22google%22%2C%22%24initial_referrer%22%3A%20%22https%3A%2F%2Fwww.google.com%2F%22%2C%22%24initial_referring_domain%22%3A%20%22www.google.com%22%2C%22Language%22%3A%20%22zh-CN%22%2C%22Platform%22%3A%20%22Web%22%2C%22Backend%20User%20Country%22%3A%20%22CN%22%2C%22Test-WS2199%22%3A%20%22variant%22%2C%22Page%20Type%22%3A%20%22Destination%20Page%22%2C%22__timers%22%3A%20%7B%7D%2C%22Login%20Status%22%3A%20false%2C%22Test-3%22%3A%20%22variant-10%22%2C%22Test-14%22%3A%20%22variant-55%22%2C%22Test-WS2196%22%3A%20%22control%22%2C%22Test-WS2350%22%3A%20%22variant%22%2C%22'Test-BB1%22%3A%20%22control%22%2C%22Test-12%22%3A%20%22control%22%2C%22Test-23%22%3A%20%22variant-70%22%2C%22Test-AAAAA%22%3A%20%22variant%22%2C%22Test-24%22%3A%20%22control%22%2C%22Test-25%22%3A%20%22control%2"
        "2%2C%22Test-26%22%3A%20%22variant-75%22%2C%22WS-2515%22%3A%20%22WS-2515-variant1%22%2C%22WS-2351%22%3A%20%22WS-2351-variant1%22%7D; wcs_bt=s_2cb388a4aa34:1565935116; _px3=51c25ddccc41460714b0c77f9086094ebca4547fe6aff217bada6cd0a71b9cda:1K7R1eyox+K6FywON0Wjpr/BvHj0YRXaQx9pH45gDDO4QEcYa7eI+hSsgvjvtAdRFfNFo/12w1i3MBbgQsHVhA==:1000:4duS7MSxB3gm7SQSqY7aj6Hnnyzqw2hPcZl8z6X6Ee56B7pT4yuuroAOE6n43zXK+D22dsZWIFh4kp3252pn2sm9khCmkHbsNckMqPyDeKKSVWjo/8QOfv+t2pDd0D6nVliwyxyI5OVY9hhoBdkkKJS41SwORVvfALvpEDnEnBg="
    ),
    "user-agent": getheaders(),
    "Sec-Fetch-Mode": "cors",
}
def get_proxy():
    """Request one proxy record from the local proxy-pool service.

    Returns:
        The decoded JSON body of ``GET http://127.0.0.1:5055/get/``.
        The caller in this file reads its ``"proxy"`` key.

    Raises:
        requests.RequestException: if the service is unreachable or does not
            answer within the timeout.
        ValueError: if the response body is not valid JSON.
    """
    # Without a timeout, requests waits forever on a stalled service.
    response = requests.get("http://127.0.0.1:5055/get/", timeout=10)
    return response.json()
def delete_proxy(proxy):
    """Ask the local proxy-pool service to drop *proxy* from the pool.

    Args:
        proxy: the proxy address, interpolated verbatim into the ``proxy``
            query parameter of the delete endpoint.

    Raises:
        requests.RequestException: if the service is unreachable or does not
            answer within the timeout.
    """
    # Without a timeout, requests waits forever on a stalled service.
    requests.get(
        "http://127.0.0.1:5055/delete/?proxy={}".format(proxy),
        timeout=10,
    )
def getHtml(product_id):
    """Download the Klook activity page for *product_id* through a pooled proxy.

    One proxy is taken from the pool and the request is attempted up to five
    times with it. If every attempt fails, the proxy is deleted from the pool.

    Args:
        product_id: activity id; converted with ``str()`` and appended to the
            activity URL.

    Returns:
        The response body as text, or ``None`` when no proxy was available or
        all attempts failed.
    """
    retry_count = 5
    proxy = get_proxy().get("proxy")
    print(proxy)
    if not proxy:
        # The pool returned no proxy; there is nothing to try or to delete.
        return None

    url = 'https://www.klook.com/zh-CN/activity/' + str(product_id)
    proxy_url = "http://{}".format(proxy)
    # requests selects the proxy by the scheme of the target URL. The target
    # is https, so a mapping with only an "http" key would bypass the proxy.
    proxies = {"http": proxy_url, "https": proxy_url}

    while retry_count > 0:
        try:
            # Request through the proxy; the timeout stops a dead proxy from
            # blocking the crawl indefinitely.
            html = requests.get(url, proxies=proxies, headers=headers, timeout=10)
            print(html)
            return html.text
        except requests.RequestException:
            retry_count -= 1
    # Failed 5 times: remove the proxy from the pool.
    delete_proxy(proxy)
    return None
def save_to_Mongo(result):
    """Store one document in the module-level ``db[MONGO_TABLE]`` collection.

    Failures are reported on stdout and not raised, so one bad document does
    not stop the crawl.

    Args:
        result: the document (a dict) to insert.
    """
    try:
        # Collection.insert() is deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the replacement for a single document.
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Print the cause (e.g. a duplicate _id) so the failure is diagnosable.
        print('存储到MongoDb失败', result, exc)
    else:
        print('存储到MongoDB成功', result)
def get_product(product_id):
    """Fetch one activity page, extract its embedded JSON, and store it.

    The page is searched for the text between ``activityInfo`` and the first
    ``"noindex":false``. If that is not found, ``"noindex":true`` is tried.
    The first 15 characters of the match are dropped and a closing ``}`` is
    appended before the text is parsed as JSON. The ``id`` field is renamed
    to ``_id`` so it becomes the MongoDB primary key.

    Args:
        product_id: activity id passed on to ``getHtml``.

    Returns:
        None. The product is skipped, with a message on stdout, when the page
        could not be downloaded or contains no activity data.
    """
    print("商品", str(product_id))
    res = getHtml(product_id)
    if res is None:
        # getHtml gives None after exhausting its retries; findall(None)
        # would raise TypeError.
        print('获取页面失败', str(product_id))
        return

    result = re.findall(r'activityInfo.*?"noindex":false', res)
    if not result:
        result = re.findall(r'activityInfo.*?"noindex":true', res)
    if not result:
        # Neither marker is present (e.g. an error or anti-bot page);
        # result[0] would raise IndexError.
        print('未找到商品数据', str(product_id))
        return

    # Drop the 15-character prefix before the JSON object and restore the
    # closing brace that the regex match stops short of.
    product_data = result[0][15:] + '}'
    product_json = json.loads(product_data)
    product_json['_id'] = product_json.pop("id")
    save_to_Mongo(product_json)
def get_product_id():
    """Read every ``_id`` from the ``klook.products_id`` collection.

    Returns:
        pandas.Series: the ``_id`` values in cursor order; empty when the
        collection has no documents.
    """
    client = pymongo.MongoClient('localhost', 27017)
    try:
        table = client['klook']['products_id']
        # Only _id is used, so project it instead of loading full documents.
        documents = list(table.find({}, {'_id': 1}))
    finally:
        # Each call opens its own client; close it so connections do not
        # pile up across calls.
        client.close()
    # Naming the column keeps the '_id' lookup valid for an empty result.
    data = pd.DataFrame(documents, columns=['_id'])
    return data['_id']
# MongoDB connection shared by save_to_Mongo().
MONGO_URl = 'localhost:27017'
MONGO_DB = 'klook'
client = pymongo.MongoClient(MONGO_URl)
db = client[MONGO_DB]
MONGO_TABLE = 'products'

# Position in the id list at which crawling starts; earlier ids are skipped.
START_INDEX = 235

# Load the id list once. Calling get_product_id() inside the loop would
# re-read the whole collection on every iteration.
product_ids = list(get_product_id())
for product_id in product_ids[START_INDEX:]:
    # Pause between requests to keep the request rate low.
    time.sleep(5)
    get_product(product_id)