Definition:
A web crawler (also called a web spider) is a program or script that automatically fetches information from the World Wide Web according to a set of rules. (Definition borrowed from Baidu; no point rewriting it myself.)
Purpose:
Scrape compound pricing information from the 陶素生化 (TSbiochem) official website:
https://tsbiochem.com/alltargets
Tools:
Python has plenty of scraping libraries; this walkthrough uses Beautiful Soup, which is simple and fast.
步骤:
# Install
conda install -c conda-forge beautifulsoup4
conda install requests
pip install fake-useragent  # needed for the random User-Agent headers used below
#导入包
import os, time, random,sys,re
import requests
import bs4
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
#requests获取网页,BeautifulSoup获取网页状态
response = requests.get('https://tsbiochem.com/alltargets')
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())
# Output (truncated)
<!DOCTYPE html>
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="true" name="HandheldFriendly"/>
<link href="/css/bootstrap.min.css" media="all" rel="stylesheet" type="text/css"/>
<link href="/images/ticontransparent.png" rel="icon"/>
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="/css/tsweb_base.css?ver=1.3" rel="stylesheet" type="text/css">
<link href="/css/tsweb_layout.css?ver=1.35" rel="stylesheet" type="text/css">
<link href="/css/tsweb_shopcart.css" rel="stylesheet" type="text/css">
<link href="/css/tsweb_orderway.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_compound.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_library.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_alltargets.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_pathway.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_search.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_target.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_contact.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_about.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_partner.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_calculator.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_news.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_join_us.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_faq.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_alllibraries.css" rel="stylesheet" type="text/css"/>
<script src="/js/jquery-3.1.1.min.js"></script>
<script src="/js/bootstrap.min.js"></script>
<script src="/js/tsweb_base.js"></script>
<meta content="dtb30YDZRhiTZGGnIbEL36fKUTMGnLcU7DTY1Sdd" name="csrf-token"/>
<title>通路靶点 | 陶素生化</title>
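Before extracting anything from the soup, it is worth confirming the request actually succeeded. A quick check (my addition, not part of the original walkthrough):

print(response.status_code)    # expect 200
response.raise_for_status()    # raises an exception for 4xx/5xx responses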
# Extract what we need: every tag whose class is 'content_a_en target_a', then collect the href link from each
body = soup.find_all(class_="content_a_en target_a")
http1 = [x['href'] for x in body]
http1
# Result (truncated)
'https://tsbiochem.com/target/Salt-Inducible%20Kinase',
'https://tsbiochem.com/target/Serine-Protease',
'https://tsbiochem.com/target/Serine-threonin-kinase',
'https://tsbiochem.com/target/SGK',
'https://tsbiochem.com/target/SGLT',
'https://tsbiochem.com/target/Sigma-receptor',
'https://tsbiochem.com/target/Sirtuin',
'https://tsbiochem.com/target/Sodium-Channel',
'https://tsbiochem.com/target/Somatostatin',
'https://tsbiochem.com/target/Src',
'https://tsbiochem.com/target/STAT',
'https://tsbiochem.com/target/Survivin',
'https://tsbiochem.com/target/Syk',
'https://tsbiochem.com/target/TAM-Receptor',
'https://tsbiochem.com/target/Telomerase',
'https://tsbiochem.com/target/TGF-beta-Smad',
'https://tsbiochem.com/target/Thioredoxin',
'https://tsbiochem.com/target/Thrombin',
# Rotate the User-Agent header on every request to make the crawler harder for anti-scraping measures to block
http2 = []
for url in http1:
    ua = UserAgent()
    headers = {"User-Agent": ua.random}
    response1 = requests.get(url, headers=headers)
    soup1 = BeautifulSoup(response1.content, "html.parser")
    # Find every 'td' tag on the target page and store them in body1
    body1 = soup1.find_all('td')
    for td in body1:
        # Look for an 'a' tag inside each cell; cells without a link are skipped
        a = td.find('a')
        if a is not None and a.has_attr('href'):
            # Collect the 'href' attribute into the list http2
            http2.append(a['href'])
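The header-rotation and parsing boilerplate above repeats for every request, so it can be factored into a small helper. A minimal sketch; the helper name fetch_soup, the timeout, and the delay range are my own choices, not part of the original walkthrough:

def fetch_soup(url):
    # Fetch a URL with a random User-Agent and return the parsed soup
    headers = {"User-Agent": UserAgent().random}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()                  # fail loudly on HTTP errors
    time.sleep(random.uniform(0.5, 1.5))     # polite delay between requests
    return BeautifulSoup(resp.content, "html.parser")

With this in place, each request in the loops below reduces to a single call such as soup3 = fetch_soup(http3[i]).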
# Deduplicate http2; note that set() removes duplicates but does not preserve order
http3 = list(set(http2))
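If you would rather keep the links in crawl order while removing duplicates, dict.fromkeys preserves insertion order (guaranteed since Python 3.7); an equivalent one-liner:

http3 = list(dict.fromkeys(http2))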
# Let's look at what we have so far. I'll show just the final result here; if you want to see what each step produces, test with a single element of http1
http3[1:10]
# Result
['https://tsbiochem.com/compound/N-Acetyl-5-hydroxytryptamine',
'https://tsbiochem.com/compound/Atipamezole',
'https://tsbiochem.com/compound/Zacopride%20hydrochloride',
'https://tsbiochem.com/compound/GTS-21-dihydrochloride',
'https://tsbiochem.com/compound/Carvedilol',
'https://tsbiochem.com/compound/Isoetharine%20mesylate%20salt',
'https://tsbiochem.com/compound/Sotalol-hydrochloride',
'https://tsbiochem.com/compound/Urapidil-hydrochloride',
'https://tsbiochem.com/compound/Pirenzepine-hydrochloride']
# Fetch the price and package-size information for each compound, stored in a dictionary
fin_dict = {}
# Fetch each compound page
for i in range(len(http3)):
    ua = UserAgent()
    headers = {"User-Agent": ua.random}
    response3 = requests.get(http3[i], headers=headers)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    # Extract the CAS number (a pattern like ' 62-51-1 '); fall back to the URL as the key
    match = re.search(r'\s\d+-\d+-\d+\s', str(soup3))
    key = match.group(0) if match else http3[i]
    # Each package-size/price pair sits on an element with class "qtyInput"
    body3 = soup3.find_all(class_="qtyInput")
    package = []
    # Store each (package size, price) pair
    for tag in body3:
        package.append([tag['package'], tag['price']])
    # Pages where nothing was found get a [['0', '0']] placeholder
    fin_dict[key] = package if package else [['0', '0']]
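A quick way to sanity-check the CAS regex used above is to run it on a made-up snippet (the sample string below is invented; the CAS number is taken from the results further down):

sample = "CAS No.: 62-51-1 (sample text)"
m = re.search(r'\s\d+-\d+-\d+\s', sample)
print(repr(m.group(0)))    # ' 62-51-1 '; note the surrounding whitespace is captured too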
# Inspect the result
fin_dict
# Result: each key is a CAS number; each value is a list of [package size, price] pairs
{' 62-51-1 ': [['25 mg', '154.00'],
['50 mg', '278.00'],
['100 mg', '500.00'],
['200 mg', '850.00'],
['1 mL * 10 mM (in DMSO)', '459.00']],
' 1210-83-9 ': [['5 mg', '336.00'],
['10 mg', '664.00'],
['25 mg', '1064.00'],
['50 mg', '1808.00'],
['100 mg', '3254.00'],
['200 mg', '5858.00']],
' 104054-27-5 ': [['5 mg', '369.00'],
['10 mg', '665.00'],
['25 mg', '1197.00'],
['50 mg', '1975.00'],
['100 mg', '3555.00'],
['200 mg', '6399.00'],
['1 mL * 10 mM (in DMSO)', '369.00']],
...}
# Write fin_dict to out.csv in CSV format
with open('out.csv', 'w') as f:
    for k, v in fin_dict.items():
        f.write(k.strip() + ',')    # strip the whitespace the regex captured around the CAS number
        for spec, price in v:
            f.write(spec + ',' + price + ',')
        f.write('\n')
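The manual string concatenation above works for this data, but the standard library csv module handles quoting and embedded commas automatically; an equivalent sketch:

import csv

with open('out.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for cas, packages in fin_dict.items():
        row = [cas.strip()]
        for spec, price in packages:
            row.extend([spec, price])
        writer.writerow(row)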