Definition:
A web crawler (also called a web spider) is a program or script that automatically fetches information from the World Wide Web according to a set of rules. (Definition borrowed from Baidu; no point rewriting it myself.)
Purpose:
Scrape compound pricing information from the 陶素生化 (TSbiochem) official website:
https://tsbiochem.com/alltargets
Tools:
Python has plenty of scraping libraries; this walkthrough uses Beautiful Soup, which is simple and fast.
步骤:
# Install
conda install -c conda-forge beautifulsoup4
conda install requests
pip install fake-useragent  # needed for the random User-Agent headers used below
#导入包
import os, time, random,sys,re
import requests
import bs4
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
#requests获取网页,BeautifulSoup获取网页状态
response = requests.get('https://tsbiochem.com/alltargets')
soup = BeautifulSoup(response.content, "html.parser")
print(soup.prettify())
# Output (truncated)
<!DOCTYPE html>
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="true" name="HandheldFriendly"/>
<link href="/css/bootstrap.min.css" media="all" rel="stylesheet" type="text/css"/>
<link href="/images/ticontransparent.png" rel="icon"/>
<link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet"/>
<link href="/css/tsweb_base.css?ver=1.3" rel="stylesheet" type="text/css">
<link href="/css/tsweb_layout.css?ver=1.35" rel="stylesheet" type="text/css">
<link href="/css/tsweb_shopcart.css" rel="stylesheet" type="text/css">
<link href="/css/tsweb_orderway.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_compound.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_library.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_alltargets.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_pathway.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_search.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_target.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_contact.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_about.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_partner.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_calculator.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_news.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_join_us.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_faq.css" rel="stylesheet" type="text/css"/>
<link href="/css/tsweb_alllibraries.css" rel="stylesheet" type="text/css"/>
<script src="/js/jquery-3.1.1.min.js"></script>
<script src="/js/bootstrap.min.js"></script>
<script src="/js/tsweb_base.js"></script>
<meta content="dtb30YDZRhiTZGGnIbEL36fKUTMGnLcU7DTY1Sdd" name="csrf-token"/>
<title>通路靶点 | 陶素生化</title>
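Before extracting anything from the soup, it is worth confirming the request actually succeeded. A quick check (my addition, not part of the original walkthrough):

print(response.status_code)    # expect 200
response.raise_for_status()    # raises an exception for 4xx/5xx responses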
# Extract what we need: every tag whose class is 'content_a_en target_a', then collect the href link from each
body = soup.find_all(class_="content_a_en target_a")
http1 = [x['href'] for x in body]
http1
# Result (truncated)
'https://tsbiochem.com/target/Salt-Inducible%20Kinase',
'https://tsbiochem.com/target/Serine-Protease',
'https://tsbiochem.com/target/Serine-threonin-kinase',
'https://tsbiochem.com/target/SGK',
'https://tsbiochem.com/target/SGLT',
'https://tsbiochem.com/target/Sigma-receptor',
'https://tsbiochem.com/target/Sirtuin',
'https://tsbiochem.com/target/Sodium-Channel',
'https://tsbiochem.com/target/Somatostatin',
'https://tsbiochem.com/target/Src',
'https://tsbiochem.com/target/STAT',
'https://tsbiochem.com/target/Survivin',
'https://tsbiochem.com/target/Syk',
'https://tsbiochem.com/target/TAM-Receptor',
'https://tsbiochem.com/target/Telomerase',
'https://tsbiochem.com/target/TGF-beta-Smad',
'https://tsbiochem.com/target/Thioredoxin',
'https://tsbiochem.com/target/Thrombin',
# Rotate the User-Agent header on every request to make the crawler harder for anti-scraping measures to block
http2 = []
for url in http1:
    ua = UserAgent()
    headers = {"User-Agent": ua.random}
    response1 = requests.get(url, headers=headers)
    soup1 = BeautifulSoup(response1.content, "html.parser")
    # Find every 'td' tag on the target page and store them in body1
    body1 = soup1.find_all('td')
    for td in body1:
        # Look for an 'a' tag inside each cell; cells without a link are skipped
        a = td.find('a')
        if a is not None and a.has_attr('href'):
            # Collect the 'href' attribute into the list http2
            http2.append(a['href'])
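The header-rotation and parsing boilerplate above repeats for every request, so it can be factored into a small helper. A minimal sketch; the helper name fetch_soup, the timeout, and the delay range are my own choices, not part of the original walkthrough:

def fetch_soup(url):
    # Fetch a URL with a random User-Agent and return the parsed soup
    headers = {"User-Agent": UserAgent().random}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()                  # fail loudly on HTTP errors
    time.sleep(random.uniform(0.5, 1.5))     # polite delay between requests
    return BeautifulSoup(resp.content, "html.parser")

With this in place, each request in the loops below reduces to a single call such as soup3 = fetch_soup(http3[i]).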
# Deduplicate http2; note that set() removes duplicates but does not preserve order
http3 = list(set(http2))
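If you would rather keep the links in crawl order while removing duplicates, dict.fromkeys preserves insertion order (guaranteed since Python 3.7); an equivalent one-liner:

http3 = list(dict.fromkeys(http2))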
# Let's look at what we have so far. I'll show just the final result here; if you want to see what each step produces, test with a single element of http1
http3[1:10]
# Result
['https://tsbiochem.com/compound/N-Acetyl-5-hydroxytryptamine',
'https://tsbiochem.com/compound/Atipamezole',
'https://tsbiochem.com/compound/Zacopride%20hydrochloride',
'https://tsbiochem.com/compound/GTS-21-dihydrochloride',
'https://tsbiochem.com/compound/Carvedilol',
'https://tsbiochem.com/compound/Isoetharine%20mesylate%20salt',
'https://tsbiochem.com/compound/Sotalol-hydrochloride',
'https://tsbiochem.com/compound/Urapidil-hydrochloride',
'https://tsbiochem.com/compound/Pirenzepine-hydrochloride']
# Fetch the price and package-size information for each compound, stored in a dictionary
fin_dict = {}
# Fetch each compound page
for i in range(len(http3)):
    ua = UserAgent()
    headers = {"User-Agent": ua.random}
    response3 = requests.get(http3[i], headers=headers)
    soup3 = BeautifulSoup(response3.content, "html.parser")
    # Extract the CAS number (a pattern like ' 62-51-1 '); fall back to the URL as the key
    match = re.search(r'\s\d+-\d+-\d+\s', str(soup3))
    key = match.group(0) if match else http3[i]
    # Each package-size/price pair sits on an element with class "qtyInput"
    body3 = soup3.find_all(class_="qtyInput")
    package = []
    # Store each (package size, price) pair
    for tag in body3:
        package.append([tag['package'], tag['price']])
    # Pages where nothing was found get a [['0', '0']] placeholder
    fin_dict[key] = package if package else [['0', '0']]
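A quick way to sanity-check the CAS regex used above is to run it on a made-up snippet (the sample string below is invented; the CAS number is taken from the results further down):

sample = "CAS No.: 62-51-1 (sample text)"
m = re.search(r'\s\d+-\d+-\d+\s', sample)
print(repr(m.group(0)))    # ' 62-51-1 '; note the surrounding whitespace is captured too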
# Inspect the result
fin_dict
# Result: each key is a CAS number; each value is a list of [package size, price] pairs
{' 62-51-1 ': [['25 mg', '154.00'],
['50 mg', '278.00'],
['100 mg', '500.00'],
['200 mg', '850.00'],
['1 mL * 10 mM (in DMSO)', '459.00']],
' 1210-83-9 ': [['5 mg', '336.00'],
['10 mg', '664.00'],
['25 mg', '1064.00'],
['50 mg', '1808.00'],
['100 mg', '3254.00'],
['200 mg', '5858.00']],
' 104054-27-5 ': [['5 mg', '369.00'],
['10 mg', '665.00'],
['25 mg', '1197.00'],
['50 mg', '1975.00'],
['100 mg', '3555.00'],
['200 mg', '6399.00'],
['1 mL * 10 mM (in DMSO)', '369.00']],
...}
# Write fin_dict to out.csv in CSV format
with open('out.csv', 'w') as f:
    for k, v in fin_dict.items():
        f.write(k.strip() + ',')    # strip the whitespace the regex captured around the CAS number
        for spec, price in v:
            f.write(spec + ',' + price + ',')
        f.write('\n')
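The manual string concatenation above works for this data, but the standard library csv module handles quoting and embedded commas automatically; an equivalent sketch:

import csv

with open('out.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for cas, packages in fin_dict.items():
        row = [cas.strip()]
        for spec, price in packages:
            row.extend([spec, price])
        writer.writerow(row)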