# Purpose: given a downloaded English-language PDF article, extract its full
# list of references and pull out the PMID list.
import sys
# sys.setdefaultencoding() does not exist on Python 3, and on Python 2 it is
# removed from the sys module at startup, so calling it unconditionally raises
# AttributeError. Only call it when the interpreter actually exposes it.
if hasattr(sys, "setdefaultencoding"):
    sys.setdefaultencoding("utf-8")
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage
from pdfminer.converter import PDFPageAggregator
import re
# Open the PDF file and extract its text.
# NOTE: the value below is a placeholder ("path to the file"); replace it with
# the real location of the PDF before running.
path = ' 文件所在路径'

# Shared pdfminer machinery: the aggregator device collects the layout of the
# page most recently processed, and the interpreter renders pages into it.
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Text of every text object found, in the order pdfminer yields them.
text_content = []

# The parser pulls data from fp while pages are being processed, so the handle
# has to stay open until the page loop is finished; `with` then guarantees it
# is closed even if parsing fails part-way through.
with open(path, 'rb') as fp:
    parser = PDFParser(fp)
    # Document object that stores the PDF's structure.
    document = PDFDocument(parser)

    # Process each page contained in the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            # Keep textual layout objects only; figures, images, etc. are skipped.
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                text_content.append(lt_obj.get_text())

print(text_content)

# Each element of text_content holds the text of one layout text object.
# Join everything into a single string with the line breaks removed.
total_text = ''.join(text_content).replace("\n", "")
# Write the reference entries found in the extracted text to a text file,
# one entry per line.
#
# A regular expression is used to pick out the reference entries. It is a
# heuristic: an entry starts with a numeric marker ("12." or "[12]") followed
# by a capitalised word, and ends at the next number followed by a period
# (typically the page range or year that closes a citation).
#
# The previous pattern was anchored with "^" and only accepted markers of the
# form "12.", while the loop below only kept matches starting with "[", so no
# match could ever be written. Because total_text is one long line, "^" also
# matched only at the very beginning and the greedy ".*" swallowed everything
# up to the last "digits." in the document.
#
# Stricter alternative from the original notes (journal-style "YYYY;" entries):
#   [0-9]+\.\s[A-Z]+.*\.\s[0-9]{4};.*[0-9]+\.
p = re.compile(r'(?:\[[0-9]+\]|[0-9]+\.)\s*[A-Z].*?[0-9]+\.')
m = p.findall(total_text)

# NOTE: the file name below is a placeholder; replace it with the real output
# path. UTF-8 is requested explicitly so non-ASCII author names or titles do
# not depend on the platform's default encoding (assumes Python 3).
with open("the file your want to save", "w", encoding="utf-8") as file:
    for i in m:
        file.write(i + "\n")