Results
Detail-page links
Product information
My Code
Main function (main.py)
# -*- coding:utf-8 -*-
from multiprocessing import Pool
from channel_extact import channel_list
from ganji_url_info import get_url_link, url_links, get_goods_info, goodsinfo
# Resume-from-breakpoint check
download_Y = [item['url'] for item in goodsinfo.find()]  # links whose detail data is already in the database
download_N = [item['url'] for item in url_links.find()]  # all collected detail-page links
Y = set(download_Y)  # convert to a set
N = set(download_N)  # convert to a set
need_to_download = N - Y  # links not yet downloaded
#def get_all_links(channel):
# for page in range(1,101):
# get_url_link(channel,page)
if __name__ == '__main__':
    # Crawl with a pool of processes
    pool = Pool()
    # Note: get_url_link takes (channel, page), so mapping it over channel_list directly
    # would fail; use a wrapper such as get_all_links above with channel_list.split().
    #pool.map(get_all_links, channel_list.split())
    pool.map(get_goods_info, need_to_download)
    pool.close()
    pool.join()
    # Or crawl directly without a Pool:
    #for url in need_to_download:
    #    get_goods_info(url)
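For reference, a minimal sketch of how the first stage of the crawl (collecting the listing links) could be wired up with the same Pool pattern; it essentially uncomments the get_all_links wrapper above and splits channel_list, which is a triple-quoted string in channel_extact.py, into individual category URLs:

# Sketch: stage one of the crawl, collecting listing links for every category.
# Assumes channel_list is still the triple-quoted string defined in channel_extact.py.
from multiprocessing import Pool
from channel_extact import channel_list
from ganji_url_info import get_url_link

def get_all_links(channel):
    # get_url_link expects (channel, page), so wrap it for Pool.map
    for page in range(1, 101):
        get_url_link(channel, page)

if __name__ == '__main__':
    pool = Pool()
    pool.map(get_all_links, channel_list.split())  # split() turns the string into a list of category URLs
    pool.close()
    pool.join()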
Fetching proxies (My_proxies.py)
import requests
from bs4 import BeautifulSoup
import random
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Connection': 'keep-alive',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
}
def get_proxies():
    url = 'http://www.xicidaili.com/nn'
    proxies_list = []  # collected proxy addresses
    wb_data = requests.get(url, headers=headers).text
    soup = BeautifulSoup(wb_data, 'lxml')
    ips = soup.select('tr.odd > td:nth-of-type(2)')  # IP address
    ports = soup.select('tr.odd > td:nth-of-type(3)')  # port
    speeds = soup.select('tr > td:nth-of-type(7) > div > div')  # speed
    connect_times = soup.select('tr > td:nth-of-type(8) > div > div')  # connection time
    # Combine the fields and keep only the fast proxies
    for ip, port, speed, connect_time in zip(ips, ports, speeds, connect_times):
        if speed.get('class')[1] == 'fast' and connect_time.get('class')[1] == 'fast':
            proxies_list.append('http://' + str(ip.text) + ':' + str(port.text))
        else:
            continue
    print(proxies_list)
get_proxies()
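get_proxies currently only prints the list; if it were changed to return proxies_list instead, the result could be fed straight into requests. A minimal sketch under that assumption, reusing the headers defined above:

# Sketch: using the scraped proxies with requests.
# Assumes get_proxies() is modified to end with `return proxies_list` instead of print.
proxies_list = get_proxies()
proxy = random.choice(proxies_list)   # e.g. 'http://121.193.143.249:80'
proxies = {'http': proxy}             # requests expects a scheme-to-proxy mapping
resp = requests.get('http://bj.ganji.com/wu/', headers=headers, proxies=proxies, timeout=10)
print(resp.status_code)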
Extracting category links (channel_extact.py)
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'
def get_channel_link(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    channel_links = soup.select('dl.fenlei > dt > a')
    #print(channel_links)
    for channel in channel_links:
        print(url_host + channel.get('href'))
channel_list ='''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
'''
#The following three categories use a page format that differs from the ones above
#http://bj.ganji.com/qitawupin/
#http://bj.ganji.com/ershoufree/
#http://bj.ganji.com/wupinjiaohuan/
#get_channel_link(url)
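Note that channel_list is one triple-quoted string rather than a Python list, so callers have to split it into individual URLs first; a small sketch:

# Sketch: split the triple-quoted channel_list string into a list of category URLs.
channels = channel_list.split()
print(len(channels))  # 16 category URLs
for channel in channels:
    print(channel)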
Collecting listing links per category and scraping the detail pages (ganji_url_info.py)
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
import pymongo
import time , random
import requests.exceptions
client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
url_links = ganji['url_links_2']
url_links_zz = ganji['url_links_zz_2']
goodsinfo = ganji['goodsinfos']
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
'Connection': 'keep-alive',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
}
# http://www.xicidaili.com/wn
proxy_list =['http://121.193.143.249:80',
'http://42.159.251.84:41795',
'http://119.6.136.122:80',
'http://101.201.235.141:8000',
'http://118.180.15.152:8102',
'http://123.57.190.51:7777'
]
proxy = random.choice(proxy_list)  # pick a proxy at random
proxies = {'http': proxy}
# Crawl the listing-page links
def get_url_link(channel, page, who_sells='o'):
    try:
        url_link = '{}{}{}/'.format(channel, str(who_sells), str(page))
        wb_data = requests.get(url_link, headers=headers)
        time.sleep(1)
        soup = BeautifulSoup(wb_data.text, 'lxml')
    except (requests.exceptions.ProxyError, requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
        print('Request error on listing page, skipping:', e)
        return  # without this, the code below would reference an undefined soup
    # Check whether the page is a valid listing page (True means the pagination bar is missing)
    url_right = soup.select('ul.pageLink.clearfix') == []
    if url_right:
        pass
    else:
        data_1 = soup.select('li.js-item > a')  # links to Ganji detail pages
        #print(data_1)
        data_2 = soup.select('div.zz-til > a')  # links that redirect to 58 Zhuanzhuan
        # Store each title and its link in url_links
        for data in data_1:
            if 'biz.click.ganji.com' not in data.get('href'):
                url_links.insert_one({'title': data.get_text(strip=True), 'url': data.get('href')})
                print({'title': data.get_text(strip=True), 'url': data.get('href')})
        # Store each title and its link in url_links_zz
        for data in data_2:
            url_links_zz.insert_one({'title': data.get_text(strip=True), 'url': data.get('href')})
            # print({'title': data.get_text(strip=True), 'url': data.get('href').split('?')[0]})
#Scrape detail-page information
def get_goods_info(url):
    try:
        wb_data = requests.get(url, headers=headers, proxies=proxies).text
        soup = BeautifulSoup(wb_data, 'lxml')
        # Skip pages whose listing has been deleted or is otherwise invalid
        if soup.select('div.error'):
            print(url)
            print('This page is Not Found!')
        else:
            title = soup.select('h1.title-name')[0].get_text() if soup.select('h1.title-name') else None  # title
            # publication date
            if soup.select('i.pr-5'):
                published = soup.select('i.pr-5')[0].get_text(strip=True)
            else:
                published = None
            goods_types = soup.select('div > ul.det-infor > li:nth-of-type(1) > span > a')
            goods_type = [i.get_text(strip=True) for i in goods_types]  # goods category tags
            locations = soup.select('div > ul.det-infor > li:nth-of-type(3) > a')
            location = [i.get_text(strip=True) for i in locations]  # trading location
            price = soup.select('i.f22.fc-orange.f-type')[0].get_text() \
                if soup.select('i.f22.fc-orange.f-type') else None  # price
            if len(soup.select('body > div > div > div.h-crumbs > div > a')) >= 3:
                classfy = soup.select('body > div > div > div.h-crumbs > div > a')[2].text
            else:
                classfy = None
            # Check whether the "condition" field exists on this page
            if soup.find(text='新旧程度:'):
                degree = soup.select('ul.second-det-infor.clearfix > li')[0].get_text().split()[-1]  # condition (new/used)
            else:
                degree = None
            #print(title, published, goods_type, location, price, classfy)
            # Save the record to the database
            # If none of the key fields were scraped, skip the record and retry it later
            if title or published or price:
                goodsinfo.insert_one({'title': title,
                                      'published': published,
                                      'goods_type': goods_type,
                                      'location': location,
                                      'price': price,
                                      'degree': degree,
                                      'url': url,  # used later to work out which links still need downloading
                                      'classfy': classfy
                                      })
                print(
                    {'title': title,
                     'published': published,
                     'goods_type': goods_type,
                     'location': location,
                     'price': price,
                     'degree': degree,
                     'url': url,
                     'classfy': classfy
                     }
                )
            else:
                pass
    except (requests.exceptions.ProxyError, requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
        print('Request error on detail page, skipping:', e)
#url='http://bj.ganji.com/yingyouyunfu/2285918732x.htm'
#get_goods_info(url)
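To spot-check what actually landed in MongoDB, a small query sketch (assuming the same local MongoDB instance and collection names used above, and pymongo 3.7+ for count_documents):

# Sketch: quick sanity check of the scraped data in MongoDB.
import pymongo

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
print(ganji['url_links_2'].count_documents({}))   # how many detail-page links were collected
print(ganji['goodsinfos'].count_documents({}))    # how many detail pages have been scraped
for doc in ganji['goodsinfos'].find().limit(3):   # peek at a few stored records
    print(doc)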
Summary:
- Problems kept coming up during development, and new features were added to the code to handle them and make the program as robust as possible. The main ones were:
|- Too many requests from a single IP got it banned by the site, so a small script (My_proxies.py) was written to fetch proxy IPs from a proxy-listing site;
|- Failed requests, timeouts and similar errors kept interrupting the crawl; try-except catches them so the offending page is skipped for now and the crawl keeps running, and the unprocessed pages can be handled in a later pass (see the retry sketch after this list);
|- A check was added for pages that are gone (404) or whose content cannot be read, so those pages are skipped and throughput improves;
|- Many of the scraped detail-page links redirect to 58 Zhuanzhuan; because those pages have a different layout, such links are stored in a separate MongoDB collection, and a separate function would have to be written to scrape their details.
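To illustrate the retry idea from the second point above, here is a minimal sketch of a fetch helper that rotates proxies and retries a few times before giving up; fetch_with_retry and max_tries are hypothetical names introduced here, while proxy_list and headers are the ones defined in ganji_url_info.py:

# Sketch: hypothetical helper that retries a request with a different random proxy each time.
# proxy_list and headers are assumed to be the ones defined in ganji_url_info.py.
import random
import requests

def fetch_with_retry(url, max_tries=3):
    for attempt in range(max_tries):
        proxy = random.choice(proxy_list)  # rotate proxies between attempts
        try:
            return requests.get(url, headers=headers,
                                proxies={'http': proxy}, timeout=10)
        except (requests.exceptions.ProxyError,
                requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout) as e:
            print('Attempt {} via {} failed: {}'.format(attempt + 1, proxy, e))
    return None  # caller can re-queue the url for a later pass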