元素定位
requests返回的response内容是HTML文本,要从中提取出所需的数据,就需要进行元素定位。常用的元素定位方式有XPath和CSS选择器;如果你熟悉JavaScript(尤其是jQuery),也可以使用pyquery。
相关的库有lxml、BeautifulSoup(BeautifulSoup 4的包名为bs4)。相关的教程太多了,这里为了完整性,举一个xpath例子,做个小总结。
例子是抓取美容下所有分类和具体项目的相关信息。
# -*- coding:utf-8 -*-
"""
File Name : 'Spider_soyoung'.py
Description:
Author: 'chengwei'
Date: '2016/4/22' '9:43'
"""
import sys
import requests
import json
import random
import redis
import logging
import pymssql
import copy
import datetime
import time
import json
from lxml import etree
import re
reload(sys)
sys.setdefaultencoding('utf8')
class Spider_plastics(object):
    """Scrape project listings from plastics.517mr.com into SQL Server.

    The crawl has two stages: ``get_detail_url`` reads the root page and
    collects one entry per project link, and ``get_detail_info`` visits every
    collected link, extracts the detail fields and inserts them into the
    ``kanghua`` table.
    """

    # (category label, id of the root-page element wrapping that category).
    _CATEGORY_SECTIONS = (
        (u'整形美容', 'zxmr'),
        (u'皮肤美容', 'pfmr'),
        (u'注射美容', 'zsmr'),
        (u'激光美容', 'jgmr'),
        (u'瘦身美容', 'sssx'),
        (u'毛发种植', 'mfzz'),
        (u'美牙健齿', 'myjc'),
        (u'中医美容', 'zymr'),
        (u'失败修复', 'sbxf'),
    )

    # Column order shared by the INSERT statement and the bound parameters.
    _COLUMNS = ('categories', 'location', 'project_classification', 'feature',
                'apply_to', 'price', 'refresh_cycle', 'attention')

    # Values are bound by the driver instead of being formatted into the SQL
    # text, so quotes inside scraped page text cannot alter the statement.
    _INSERT_SQL = (
        "INSERT INTO kanghua (categories, location, project_classification, feature, "
        "apply_to, price, refresh_cycle, attention) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
    )

    # Seconds to wait on an HTTP request before giving up on it.
    _REQUEST_TIMEOUT = 30

    def __init__(self):
        """Configure logging and open the database connection."""
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
            'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
            'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
            'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
        ]
        self.root_url = 'http://plastics.517mr.com/'
        # Log to "<ClassName>.log", appending across runs.
        self.logfilename = self.__class__.__name__ + '.log'
        logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p',
                            filename=self.logfilename, filemode='a')
        # NOTE(review): connection settings (including the password) are
        # hard-coded here; consider moving them to configuration.
        self.conn = pymssql.connect(host='99.48.58.23', user='sa', password='123456',
                                    database='meirong', charset="utf8")
        self.cur = self.conn.cursor(as_dict=True)

    def close(self):
        """Close the database connection opened in ``__init__``."""
        self.conn.close()

    def _random_headers(self):
        """Return request headers carrying a randomly chosen User-Agent."""
        return {"User-Agent": random.choice(self.user_agents)}

    @staticmethod
    def _listing_rows(category, names, links):
        """Pair the links of one listing block with their project names.

        ``links[0]`` is skipped, and ``links[m]`` (m >= 1) is paired with
        ``names[m + 2]``; ``names[0]`` is used as the location of every row.
        Pairing stops (with a warning) as soon as there is no name left for a
        link instead of raising ``IndexError``.

        :param category: label stored under ``'categories'``.
        :param names: text lines extracted from the listing block.
        :param links: ``href`` values found in the same block.
        :return: list of dicts with the keys ``categories``, ``location``,
            ``project_classification`` and ``url``.
        """
        rows = []
        for position in range(1, len(links)):
            name_index = position + 2
            if name_index >= len(names):
                logging.warning("listing block has %d links but only %d text lines",
                                len(links), len(names))
                break
            rows.append({
                'categories': category,
                'location': names[0],
                'project_classification': names[name_index],
                'url': links[position],
            })
        return rows

    def get_detail_url(self):
        """Fetch the root page and collect every project link on it.

        :return: list of dicts as produced by ``_listing_rows``, one per
            project link, in page order across the known category sections.
        """
        headers = self._random_headers()
        session = requests.Session()
        try:
            response = session.get(self.root_url, headers=headers,
                                   timeout=self._REQUEST_TIMEOUT)
            time.sleep(3)
            selector = etree.HTML(response.text)
            url_list = []
            for category, section_id in self._CATEGORY_SECTIONS:
                blocks = selector.xpath(
                    '//*[@id="%s"]//div[starts-with(@class,"xm_list")]' % section_id)
                for block in blocks:
                    # Both queries are invariant for the block, so run them once.
                    names = (block.xpath('string(.)')
                             .replace(' ', '').replace('\t', '').strip().split('\n'))
                    links = block.xpath('.//a/@href')
                    url_list.extend(self._listing_rows(category, names, links))
            return url_list
        finally:
            # Release the connection pool even when the fetch or parse fails.
            session.close()

    def get_detail_info(self):
        """Visit every collected project link and store what it shows.

        Pages that cannot be fetched, return a non-200 status, or match
        neither known layout are logged and skipped.
        """
        headers = self._random_headers()
        url_list = self.get_detail_url()
        session = requests.Session()
        try:
            for item in url_list:
                self._crawl_detail_page(session, headers, item)
        finally:
            session.close()

    def _crawl_detail_page(self, session, headers, item):
        """Fetch one detail page and insert the rows found on it."""
        try:
            res = session.get(item['url'], headers=headers,
                              timeout=self._REQUEST_TIMEOUT)
        except requests.RequestException:
            logging.exception("request failed: %s", item['url'])
            return
        if res.status_code != 200:
            logging.error("%s:%d", item['url'], res.status_code)
            return

        selector = etree.HTML(res.text)
        table_rows = selector.xpath('//*[@id="catelist"]//div[@class = "diy_tr"]')
        price_nodes = selector.xpath('//div[@class = "price"]/em')

        # First layout: a table with one "diy_tr" row per offer.
        for element in table_rows:
            try:
                row = self._parse_table_row(element, item)
            except IndexError:
                logging.error("unexpected row structure: %s", item['url'])
                continue
            time.sleep(0.6)
            self._insert_row(row, " 第一种布局 INSERT ERROR")

        # Second layout: only a single price element is available.
        if price_nodes:
            logging.info("another css:%s %s %s", item['categories'], item['location'],
                         item['project_classification'])
            row = {
                'categories': item['categories'],
                'location': item['location'],
                'project_classification': item['project_classification'],
                'feature': '',
                'apply_to': '',
                'price': price_nodes[0].xpath('string(.)'),
                'refresh_cycle': '',
                'attention': '',
            }
            time.sleep(0.6)
            self._insert_row(row, "第二种布局 INSERT ERROR")
        elif not table_rows:
            # Report an unrecognised page only when neither layout matched.
            logging.error("error:%s %s %s", item['categories'], item['location'],
                          item['project_classification'])

    @staticmethod
    def _parse_table_row(element, item):
        """Build the record for one "diy_tr" row of the first page layout.

        :raises IndexError: when one of the expected ``span`` cells is missing.
        """
        def cell(css_class):
            return element.xpath('./span[@class = "%s"]' % css_class)[0]

        return {
            'categories': item['categories'],
            'location': item['location'],
            'project_classification': item['project_classification'],
            'feature': cell('w1 outer').xpath('string(.)').strip().split('\n')[0],
            'apply_to': cell('w3 outer').xpath('string(.)').strip(),
            'price': cell('w4 outer').xpath('string(.)').strip(),
            'refresh_cycle': cell('w5 outer').xpath('string(.)').strip(),
            # Number of <em class="x"> markers inside the "c6" container.
            'attention': len(cell('w6 outer').xpath('.//div[@class = "c6"]/em[@class = "x"]')),
        }

    def _insert_row(self, row, error_message):
        """Insert one record; log ``error_message`` with a traceback on failure.

        :return: ``True`` when the row was committed, ``False`` otherwise.
        """
        # Every value is bound as text, the numeric count included.
        params = tuple('%s' % row[column] for column in self._COLUMNS)
        try:
            self.cur.execute(self._INSERT_SQL, params)
            self.conn.commit()
        except Exception:
            # Best effort: one failed insert must not stop the crawl.
            logging.exception(error_message)
            return False
        return True
# Script entry point: constructing the spider configures logging and opens the
# database connection; get_detail_info() then runs the whole crawl.
test = Spider_plastics()
test.get_detail_info()
xpath说明:
基本语法可参考W3CSchool
获取某个节点下的所有文本可以使用string(.)
element.xpath('string(.)')
- 常用的功能函数
starts-with
//div[starts-with(@id,'res')]
contains和and(.代表当前节点,..表示父节点)
//span[contains(.,'_Test') and contains(.,'KPI')]
- Chrome插件XPather,测试xpath的好工具
- BeautifulSoup文档