import sys,os,re
import time
import shutil
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
#from selenium.webdriver.support import Thread
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.support import expected_conditions as EC
# Firefox profile shared by every WebDriver instance created in this script.
profile = webdriver.FirefoxProfile()

# Download handling: folderList=2 tells Firefox to use the custom directory
# given in browser.download.dir instead of the default Downloads folder.
profile.set_preference('browser.download.dir', '/tmp/mozilla_baowenjuan0')
profile.set_preference('browser.download.folderList', 2)
profile.set_preference('browser.download.manager.showWhenStarting', False)
# neverAsk.saveToDisk takes a comma-separated list of MIME types that are
# saved without a prompt; it is not a directory path.
# NOTE(review): adjust the list to the file types actually downloaded.
profile.set_preference(
    'browser.helperApps.neverAsk.saveToDisk',
    'text/plain,text/csv,text/tab-separated-values,application/octet-stream',
)

# Manual proxy. Firefox only honours the host/port preferences below when
# network.proxy.type is 1 (manual configuration), and https:// pages use the
# "ssl" proxy entries rather than the "http" ones.
# NOTE(review): this assumes the proxy accepts HTTPS (CONNECT) traffic on the
# same port — confirm before relying on it.
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.http', '222.73.130.111')
profile.set_preference('network.proxy.http_port', 888)
profile.set_preference('network.proxy.ssl', '222.73.130.111')
profile.set_preference('network.proxy.ssl_port', 888)

# Write the preferences out only after all of them have been set.
profile.update_preferences()

option = webdriver.FirefoxOptions()
option.add_argument('--headless')  # run Firefox without a visible window

# Module-level browser used for the first patient page.
# NOTE(review): main() opens its own driver, and this one is never quit in the
# code shown here.
dr = webdriver.Firefox(options=option, firefox_profile=profile)
dr.get("https://www.cbioportal.org/patient?studyId=blca_bgi&caseId=B10")
我们提取 Sample ID 列里面的链接,打开后会进入另一个页面:
在这个页面上,先要把所有条目都显示出来(模拟点击“show more”),然后把复选框全部勾选上,最后再把表格里的内容爬下来。
def extract_mutations(link, dr_sub):
    """Open a sample page and print every row of its table.

    The page is loaded in ``dr_sub``, the table is expanded by clicking the
    "show more" button (at most 8 times), every checkbox on the page is
    ticked so that all optional columns are shown, and then each table row
    is printed as a list of cell texts.

    Args:
        link: URL of the page to scrape.
        dr_sub: Selenium WebDriver used to load the page. It is navigated
            away from whatever page it was showing before.

    Returns:
        None. The rows are written to standard output.
    """
    show_more_xpath = '//div/button[@type="button" and @id="showMoreButton"]'
    column_menu_xpath = '//div/button[@id="dropdown-custom-1"]'

    print("checking this link:" + link)
    dr_sub.get(link)
    time.sleep(4)  # fixed pause so the page can finish rendering

    # Expand the table. The click budget guards against looping forever on a
    # button that never becomes disabled; a missing button ends the loop
    # instead of raising IndexError.
    for _ in range(8):
        buttons = dr_sub.find_elements(By.XPATH, show_more_xpath)
        if not buttons or not buttons[0].is_enabled():
            break
        buttons[0].click()
        time.sleep(1.5)

    # Open the column chooser (coordinates and similar fields are hidden by
    # default) and select every checkbox that is not selected yet.
    column_menus = dr_sub.find_elements(By.XPATH, column_menu_xpath)
    if column_menus:
        column_menus[0].click()
    for checkbox in dr_sub.find_elements(By.XPATH, "//*[@type='checkbox']"):
        if not checkbox.is_selected():
            checkbox.click()
    time.sleep(1)  # let the table redraw with the newly enabled columns

    for tr in dr_sub.find_elements(By.XPATH, '//tbody/tr'):
        # The leading "." scopes the search to this row; a bare "//td" would
        # match every cell in the whole document for every row.
        row_cells = [td.text for td in tr.find_elements(By.XPATH, './/td')]
        print(row_cells)
开始测试的时候不小心弄了一个死循环,结果直接被封IP:
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=40283): Max retries exceeded with url: /session/0023ea81-f0c1-4cdc-a22b-4bfee9c75eb4/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f5f090e7b70>: Failed to establish a new connection: [Errno 111] Connection refused',))
于是换 IP,也就是前面配置代理的那两行(实际循环跑的时候需要不断更换 IP,否则会中断,网站的反爬很敏感)。注意:上面报错里的 127.0.0.1 是本机 WebDriver 服务的地址,这个错误说明本地的浏览器会话已经断开,不一定是网站封 IP 造成的:
# Proxy host and port preferences, repeated from the profile setup at the top of the file.
# NOTE(review): Firefox presumably ignores manual proxy host/port unless
# network.proxy.type is set to 1 — confirm that preference is set as well.
profile.set_preference('network.proxy.http','222.73.130.111')
profile.set_preference('network.proxy.http_port', 888)
拿一个研究(blca_bgi)来测试:
def _collect_sample_links(driver):
    """Return one link per table row of the page currently shown in *driver*.

    For each ``<tr>`` the cells that directly contain an
    ``<a target="_blank">`` are considered, and the link of the second such
    cell is returned (presumably the Sample ID column — verify against the
    page layout). Rows with fewer than two linked cells are skipped.
    """
    sample_links = []
    for tr in driver.find_elements(By.XPATH, '//tbody/tr'):
        # Relative XPath: the first matching anchor of each cell in this row.
        # An absolute '//td/a' would return the first link in the document
        # no matter which row or cell it is called on.
        anchors = tr.find_elements(By.XPATH, './td/a[@target="_blank"][1]')
        if len(anchors) > 1:
            sample_links.append(anchors[1].get_attribute('href'))
    return sample_links


def main():
    """Scrape the table of every sample listed on the blca_bgi clinical-data page.

    The parent-page row contents were originally going to be printed as well,
    but the sample page already carries that information, so only the sample
    links are followed.
    """
    dr = webdriver.Firefox(options=option, firefox_profile=profile)
    try:
        main_link = 'https://www.cbioportal.org/study/clinicalData?id=blca_bgi'
        dr.get(main_link)
        time.sleep(10)  # fixed pause so the clinical-data table can render

        # Gather every link before navigating: extract_mutations() loads a new
        # page in the same driver, which would invalidate row elements that
        # were still being iterated.
        for sample_link in _collect_sample_links(dr):
            extract_mutations(sample_link, dr)
    finally:
        # Always shut the browser down, even if scraping fails part-way.
        dr.quit()
# Start the scrape only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
结果会直接打印到标准输出;作为脚本运行时,可以把输出重定向到文件。