Let's go straight to the code example:
import requests
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
from spiderman.items import SpidermanItem
class ItjuziWebSpider(CrawlSpider):
    name = "itjuziweb"
    allowed_domains = ["itjuzi.com"]
    start_urls = ["https://www.itjuzi.com/user/login"]

    rules = (
        # Follow the paginated company list pages (no callback, link discovery only).
        Rule(LinkExtractor(allow=(r'http://www.itjuzi.com/company\?page=\d+',))),
        # Parse each company detail page with parse_item.
        Rule(LinkExtractor(allow=(r'http://www.itjuzi.com/company/\d+',)), callback='parse_item'),
    )
    def __init__(self, *a, **kw):
        super(ItjuziWebSpider, self).__init__(*a, **kw)
        self.cookie = {}
        # A plain requests session is used only for the login POST below.
        self.resp = requests.Session()
        self.request_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Referer": "https://www.itjuzi.com",
        }
        self.post_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Referer": "https://www.itjuzi.com/",
        }
    def parse_start_url(self, response):
        # Log in with requests first, then hand the session cookies over to Scrapy.
        print('Preparing login')
        url = 'https://www.itjuzi.com/user/login'
        post_data = {
            'identity': 'your_account@example.com',  # replace with your own account
            'password': 'your_password',             # replace with your own password
            'remember': '1',
            'page': '',
            'url': '',
        }
        r = self.resp.post(url, headers=self.post_headers, data=post_data)
        # Convert the requests cookie jar into a plain dict that Scrapy can reuse.
        cookie_dict = {}
        for x in r.cookies:
            cookie_dict[x.name] = x.value
        self.cookie = cookie_dict
        # self.after_login()
        # Start the crawl from the first company list page, carrying the login cookies.
        yield Request('http://www.itjuzi.com/company?page=1',
                      headers=self.post_headers, cookies=self.cookie)
    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)
        item = SpidermanItem()
        # extract_first(default='') avoids an IndexError when a node is missing.
        item['item_id'] = response.xpath("//*[@id='modal_myinc']/div/div/div[2]/div/form/input/@value").extract_first(default='')
        item['item_name'] = response.xpath("//input[contains(@name,'com_name')]/@value").extract_first(default='')
        # extract() always returns a list (never None), so test for emptiness instead.
        item_logo = response.xpath("//div[contains(@class,'rowhead')]/div[contains(@class,'pic')]/img/@src").extract()
        if not item_logo:
            item['item_logo'] = ''
        else:
            item['item_logo'] = ','.join(item_logo)
        item['item_brief'] = response.xpath("//meta[@name='Description']/@content").extract_first(default='')
        item_area = response.xpath("//div[contains(@class,'tagset dbi c-gray-aset')]/a/span/text()").extract()
        if not item_area:
            item['item_area'] = ''
        else:
            item['item_area'] = ','.join(item_area)
        item_CEO = response.xpath("//a[contains(@class,'title')]//span[contains(@class,'c')][1]/text()").extract()
        if not item_CEO:
            item['item_CEO'] = ''
        else:
            item['item_CEO'] = ','.join(item_CEO)
        item_round = response.xpath("//span[contains(@class,'t-small c-green')]/text()").extract()
        if not item_round:
            item['item_round'] = ''
        else:
            # Strip surrounding commas, whitespace and parentheses in one pass.
            item['item_round'] = ','.join(item_round).strip(',\n\t()')
        item_website = response.xpath("//input[contains(@name,'com_url')]/@value").extract()
        if not item_website:
            item['item_website'] = ''
        else:
            item['item_website'] = ','.join(item_website).strip(',')
        item['item_from'] = 'IT桔子'
        item['item_phone'] = ''
        item['item_email'] = ''
        item_weixin = response.xpath("//li[@class='wx-text']/a/text()").extract()
        if not item_weixin:
            item['item_weixin'] = ''
        else:
            item['item_weixin'] = ','.join(item_weixin).strip(', ')
        item_weibo = response.xpath("//div[@class='link-line']/a[1]/@href").extract()
        if not item_weibo:
            item['item_weibo'] = ''
        else:
            item['item_weibo'] = ','.join(item_weibo).strip(', ')
        item['item_from_website'] = response.url
        item_address = response.xpath("//span[contains(@class,'loca c-gray-aset')]/a/text()").extract()
        if not item_address:
            item['item_address'] = ''
        else:
            item['item_address'] = ','.join(item_address).strip(', ')
        return item
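
SpidermanItem is imported from spiderman.items but not shown above. As a minimal sketch, assuming the item only declares the fields populated in parse_item (your actual items.py may define more), it could look like this:

# spiderman/items.py -- sketch only; field names taken from parse_item above
import scrapy

class SpidermanItem(scrapy.Item):
    item_id = scrapy.Field()
    item_name = scrapy.Field()
    item_logo = scrapy.Field()
    item_brief = scrapy.Field()
    item_area = scrapy.Field()
    item_CEO = scrapy.Field()
    item_round = scrapy.Field()
    item_website = scrapy.Field()
    item_from = scrapy.Field()
    item_phone = scrapy.Field()
    item_email = scrapy.Field()
    item_weixin = scrapy.Field()
    item_weibo = scrapy.Field()
    item_from_website = scrapy.Field()
    item_address = scrapy.Field()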
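
For the login-then-crawl flow to work when you run the spider, cookies must stay enabled so the cookies passed in parse_start_url are kept for the requests generated by the rules, and a small download delay is a sensible courtesy. A possible settings.py fragment (the values below are only suggestions, not part of the original project):

# settings.py -- suggested fragment, not from the original project
COOKIES_ENABLED = True   # default; the login cookies would be dropped if set to False
DOWNLOAD_DELAY = 1       # be gentle with the site
ROBOTSTXT_OBEY = False   # the company pages may be disallowed by robots.txt

# run from the project root:
#   scrapy crawl itjuziweb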