# -*- coding: UTF-8 -*-
import re
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymongo
def element_find(xpath):
    """Wait until the element matching *xpath* is present and return it.

    Uses the module-level ``driver`` and waits up to 30 seconds.

    Args:
        xpath: XPath expression locating the element.

    Returns:
        The element returned by the wait.

    Raises:
        Exception: Whatever the wait raised (for a timeout this is expected
            to be selenium's ``TimeoutException``). The browser has already
            been quit when the exception propagates.
    """
    try:
        return WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
    except Exception:
        # Clean up the browser, report, then re-raise the real error.
        # Previously the function fell through to ``return element`` with
        # ``element`` unbound, masking the cause behind an UnboundLocalError.
        driver.quit()
        print("元素加载超时")
        raise
def get_hotel_list(city_id, arrive_time, leave_time):
    """Open the hotel page for a city, fill in the stay dates and click through.

    Args:
        city_id: City slug appended to the international hotels URL.
        arrive_time: Text typed into the check-in field.
        leave_time: Text typed into the check-out field.
    """
    driver.get("http://hotels.ctrip.com/international//" + city_id)

    date_fields = (
        ('//*[@id="txtCheckIn"]', arrive_time),
        ('//*[@id="txtCheckOut"]', leave_time),
    )
    for field_xpath, date_text in date_fields:
        # The field is looked up again after the pause rather than reused.
        element_find(field_xpath).clear()
        time.sleep(2)
        element_find(field_xpath).send_keys(date_text)
        time.sleep(2)

    # Per the original comment, this link leads to the map page.
    element_find('//*[@id="side_inner"]/div[1]/div[1]/a').click()
    time.sleep(2)
def get_info(res):
    """Return every hotel list-item HTML fragment found in *res*.

    An empty list means no hotel entries were found in the page source.
    """
    return re.findall(
        r'<div class="side_list_item".*?</span></div></div></div>', res)
def map_biggest():
    """Enlarge the map by clicking the same map button six times.

    The button lives inside the first frame of the page, so the driver
    switches into frame 0 for the clicks and back to the top-level document
    afterwards.
    """
    button_xpath = '//*[@id="map"]/div/div/div[10]/div[1]/div/button[2]'
    click_count = 6

    driver.switch_to.frame(0)
    for attempt in range(click_count):
        if attempt:
            # Pause one second between clicks (not before the first one).
            time.sleep(1)
        element_find(button_xpath).click()
    driver.switch_to.default_content()
def get_id(res):
    """Return the value of the first ``id="..."`` attribute found in *res*.

    Raises:
        IndexError: If *res* contains no such attribute.
    """
    matches = re.findall(r'id=".*?"', res)
    first = matches[0]
    # Strip the leading 'id="' (4 characters) and the closing quote.
    return first[4:-1]
def get_title(res):
    """Return the hotel name: the first ``title="..."`` value followed by `` href``.

    Raises:
        IndexError: If *res* contains no such attribute.
    """
    hits = re.findall(r'title=".*?" href', res)
    raw = hits[0]
    # 'title="' is 7 characters long, the trailing '" href' is 6.
    return raw[7:len(raw) - 6]
def get_price(res):
    """Return the hotel price: the text between ``price="`` and ``" curr``.

    Raises:
        IndexError: If *res* contains no such attribute.
    """
    found = re.findall(r'price=.*?" curr', res)
    head = found[0]
    # Cut 7 leading characters ('price="') and 6 trailing ones ('" curr').
    without_prefix = head[7:]
    return without_prefix[:-6]
def get_lat(res):
    """Return the latitude: the part of the ``pos="..."`` value before ``|``.

    Raises:
        IndexError: If *res* contains no ``pos="...|`` sequence.
    """
    candidates = re.findall(r'pos=".*?\|', res)
    leading = candidates[0]
    # Remove 'pos="' (5 characters) and the trailing '|' separator.
    return leading[5:-1]
def get_lng(res):
    """Return the longitude: the part of the ``pos="..."`` value after ``|``.

    The pattern is anchored to the ``pos="`` attribute, as in ``get_lat`` and
    ``get_pos``. The previous pattern started at the first ``|`` anywhere in
    *res*, so a ``|`` earlier in the markup (for example inside a title)
    produced a wrong value.

    Raises:
        IndexError: If *res* contains no ``pos="<lat>|<lng>"`` attribute.
    """
    pattern = re.compile(r'pos="[^"|]*\|(.*?)"')
    result = pattern.findall(res)
    return result[0]
def get_pos(res):
    """Return the full value of the first ``pos="..."`` attribute in *res*.

    Raises:
        IndexError: If *res* contains no such attribute.
    """
    attribute = re.findall(r'pos=".*?"', res)[0]
    # Keep only what sits between 'pos="' (5 characters) and the end quote.
    return attribute[5:-1]
def get_mark(res):
    """Return the hotel score: the text between ``"b">`` and ``</span>``.

    Raises:
        IndexError: If *res* contains no such fragment.
    """
    fragments = re.findall(r'"b">.*?</span>', res)
    fragment = fragments[0]
    # '"b">' is 4 characters long and '</span>' is 7.
    return fragment[4:len(fragment) - 7]
def get_url(res):
    """Return the value of the first ``href="..."`` attribute found in *res*.

    Raises:
        IndexError: If *res* contains no such attribute.
    """
    links = re.findall(r'href=".*?"', res)
    # Slice away 'href="' (6 characters) and the closing quote.
    return links[0][6:-1]
def get_hotel(res, total_count, page):
    """Collect the details of every hotel on the current page and store them.

    For each HTML fragment in *res*, the matching list entry is clicked, the
    picture URL and address are read from inside the ``mapIframe`` frame, and
    the assembled record is passed to ``save_to_mongo``.

    Args:
        res: List of hotel list-item HTML fragments (as returned by
            ``get_info``).
        total_count: Total number of hotels, stored in each record.
        page: Current page number; used with the item index to compute the
            hotel's overall rank (10 hotels per page).
    """
    for i, item in enumerate(res):
        hotel_id = get_id(item)
        entry_xpath = '//*[@id="' + hotel_id + '"]/div/span'

        time.sleep(10)
        element_find(entry_xpath).click()
        # The 4th and 5th entries get a second click. The original condition
        # ``i == 3 or 4`` was always true because ``4`` is truthy, so every
        # entry was clicked twice.
        if i in (3, 4):
            element_find(entry_xpath).click()
        time.sleep(2)

        # The picture and address are read from inside the map iframe.
        driver.switch_to.frame("mapIframe")
        time.sleep(2)
        img_url = element_find(
            '//*[@id="map"]/div/div/div[1]/div[3]/div/div[4]/div/div/div[1]/div/img'
        ).get_attribute('src')
        time.sleep(2)
        address = element_find(
            '//*[@id="map"]/div/div/div[1]/div[3]/div/div[4]/div/div/div[2]/div[3]'
        ).text
        time.sleep(2)
        driver.switch_to.default_content()

        # City-related fields are hard-coded for Seoul.
        hotel_info = {
            "_id": hotel_id,
            "rank": get_mark(item),
            "city_name": "首尔",
            "city_id": "seoul274",
            "pos": get_pos(item),
            "address": address,
            "id": hotel_id,
            "link": get_url(item),
            "area_name": "首尔",
            "areacode": "kr",
            "citycode": "seoul",
            "pic_url": img_url,
            "title": get_title(item),
            # Overall 1-based position across pages.
            "paiming": int(page - 1) * 10 + i + 1,
            "total_page": total_count // 10,
            "page": int(page),
            "total_count": total_count,
        }
        time.sleep(2)
        save_to_mongo(hotel_info)
        time.sleep(2)
def save_to_mongo(result):
    """Insert *result* into the configured MongoDB collection.

    Storage is best-effort: a failed insert is reported on stdout together
    with the exception, and is not re-raised, so scraping continues.

    Args:
        result: The document (dict) to insert into ``db[MONGO_TABLE]``.
    """
    try:
        db[MONGO_TABLE].insert_one(result)
    except Exception as exc:
        # Include the exception so the reason for the failure is not lost.
        print('store failed', result, exc)
    else:
        print('store succeeded', result)
# MongoDB settings: server address, database name and target collection.
MONGO_URl = 'localhost:27017'
MONGO_DB = 'xiecheng'
MONGO_TABLE = 'seoul274'

# Module-level client and database handle used by save_to_mongo.
client = pymongo.MongoClient(MONGO_URl)
db = client[MONGO_DB]
if __name__ == "__main__":
    # Script entry point: scrape every page of the Seoul hotel map list.
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    # ``driver`` is a module-level global used by the helper functions above.
    driver = webdriver.Chrome(options=chrome_options)
    # For debugging with a visible browser, use webdriver.Chrome() instead.
    try:
        driver.maximize_window()
        get_hotel_list("seoul274", "2019-09-16", "2019-09-17")

        # Switch to the newly opened window (the map page).
        handle = driver.current_window_handle
        handles = driver.window_handles
        for new_handle in handles:
            if new_handle != handle:
                driver.switch_to.window(new_handle)

        # Read the total hotel count and derive the page count.
        total_count = int(element_find('//*[@id="J_totalHotel"]').text)
        # Ceiling division at 10 hotels per page, so a partial last page is
        # counted as well.
        pages = -(-total_count // 10)
        pag = 1
        # ``<=`` visits the final page too and terminates when pages < 1.
        # The previous ``!=`` test skipped the last page and could never
        # become false when pages was 0.
        while pag <= pages:
            print("page: " + str(pag))
            time.sleep(2)
            page = driver.page_source
            res = get_info(page)
            if not res:
                # No hotel entries in the page source: reload the page by
                # URL and try again.
                print("hotels disappear!")
                time.sleep(10)
                driver.get("https://hotels.ctrip.com/international/maplist/seoul274/p" + str(pag))
                time.sleep(10)
                continue

            time.sleep(2)
            map_biggest()
            get_hotel(res, total_count, pag)
            pag = pag + 1
            if pag <= pages:
                # Advance only while another page remains to be scraped.
                time.sleep(2)
                element_find('//*[@id="c_page_mini_next"]').click()
                time.sleep(10)
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()
使用selenium与无头Chrome爬取携程酒店信息
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- selenium与chromedriver安装 安装chrome(有版本要求,linux和windows版本要求不...
- Selenium (浏览器自动化测试框架)介绍 Selenium是一个用于Web应用程序测试的工具。Seleniu...
- 上周,有同事跟我说:“听说你会写Python,那能请你帮我写个爬虫吗?” 之前虽然对爬虫也仅仅停留在最基础的地方,...
- 背景 最近一直在搞论坛的爬虫。爬着爬着,突然遇到一个论坛的反爬虫机制比较强。例如:http://bbs.nubia...