# Source code for pdf_util
import pybtex.database
from pylatexenc.latex2text import LatexNodes2Text # to unescape latex
from pathlib import Path
import re
import PyPDF2
from difflib import SequenceMatcher
def get_title_info(entry):
    """Try to guess title information from a publication.

    :entry: pathlib.Path of a pdf file, or pybtex.database.Entry
    :returns: str -- the title, or None if no title is recorded
    :raises NotImplementedError: if entry is neither a Path nor an Entry
    """
    if isinstance(entry, Path):
        pdf = PyPDF2.PdfFileReader(str(entry))
        # documentInfo is None for a pdf without an info dictionary, so a
        # bare "in" test would raise TypeError on such files.
        info = pdf.documentInfo
        if info is not None and '/Title' in info:
            return info['/Title']
        return None

    if isinstance(entry, pybtex.database.Entry):
        # Prefer the entry's own title; fall back to the containing book.
        for field in ('title', 'booktitle'):
            if field in entry.fields:
                return LatexNodes2Text().latex_to_text(entry.fields[field])
        return None

    raise NotImplementedError("Can only handle pdf or bib objects (was %s)"
                              % type(entry))
def get_author_info(entry):
    """Try to extract some form of author string from a pdf or a bibitem.

    :entry: pathlib.Path of a pdf file, or pybtex.database.Entry
    :returns: str -- For a pdf: the '/Author' metadata value, or the file
        stem when that metadata is missing. For a bib entry: the 'author'
        and 'authors' fields plus the person names, unescaped from latex,
        or None if the unescaped text is empty.
    :raises NotImplementedError: if entry is neither a Path nor an Entry
    """
    if isinstance(entry, Path):
        pdf = PyPDF2.PdfFileReader(str(entry))
        # documentInfo is None for a pdf without an info dictionary.
        info = pdf.documentInfo
        if info is not None and '/Author' in info:
            return info['/Author']
        # last straw: filename
        return entry.stem

    if isinstance(entry, pybtex.database.Entry):
        author1 = ''
        author2 = ''
        author3 = ''
        if 'author' in entry.fields:
            author1 = str(entry.fields['author'])
        if 'authors' in entry.fields:
            author2 = entry.fields['authors']
        # Look the role up as a key. Searching str(entry.persons) also matches
        # person *names* containing "author"/"editor" and then fails with a
        # KeyError on the missing role.
        # NOTE(review): editors, when present, replace the author names taken
        # from entry.persons (behaviour kept as it was) -- confirm intended.
        for role in ('author', 'editor'):
            if role in entry.persons:
                names = [' '.join(p.first_names + p.middle_names
                                  + p.last_names)
                         for p in entry.persons[role]]
                author3 = ' '.join(names)

        author_info = LatexNodes2Text().latex_to_text(
            ' '.join((author1, author2, author3)))
        if author_info:
            return author_info.strip()
        return None

    raise NotImplementedError("Can only handle pdf or bib objects (was %s)"
                              % type(entry))
def pdf_for_pub(entry, pdf_folder):
    """Attempt to guess which pdf file might belong to a given bibliography entry.

    Every ``*.pdf`` file directly inside the folder is scored: the title
    match inside the pdf text counts three quarters, the similarity of the
    author strings one quarter. The best file is returned only if its score
    exceeds 0.5.

    :entry: pybtex.database.Entry -- the bibliography entry
    :pdf_folder: str denoting folder location
    :returns: pathlib.Path of the best matching pdf, or None
    :raises OSError: if pdf_folder does not exist or is not a folder
    """
    pdf_path = Path(pdf_folder)
    if not pdf_path.exists():
        raise OSError('Folder %s does not exist.' % pdf_folder)
    if not pdf_path.is_dir():
        raise OSError('%s is not a folder.' % pdf_folder)

    # Only regular pdf files can be parsed; sub-folders and other files would
    # make the pdf reader fail. Sorting makes tie-breaking reproducible.
    candidates = sorted(path for path in pdf_path.iterdir()
                        if path.is_file() and path.suffix.lower() == '.pdf')
    if not candidates:
        return None

    # The entry does not change while scanning: extract its info only once.
    author_info = get_author_info(entry)
    title_info = get_title_info(entry)
    author_words = author_info.split() if author_info else []

    match = None
    confidence = 0
    for candidate in candidates:
        # Without a title there is nothing to search the pdf text for.
        if title_info:
            confidence_title = search_for_string(candidate, title_info)
        else:
            confidence_title = 0

        author_info_pdf = get_author_info(candidate)
        if author_words and author_info_pdf:
            confidence_author = SequenceMatcher(None,
                                                author_info_pdf.split(),
                                                author_words,
                                                autojunk=False).ratio()
        else:
            # Two empty word lists would otherwise compare as a perfect match.
            confidence_author = 0

        new_confidence = (3 * confidence_title + confidence_author) / 4
        if confidence < new_confidence:
            confidence = new_confidence
            match = candidate

    if match and confidence > 0.5:
        if 'title' in entry.fields:
            title = entry.fields['title']
        elif 'booktitle' in entry.fields:
            title = entry.fields['booktitle']
        else:
            title = '<no title>'
        print('Match for %s: %s, score=%f' % (title, match, confidence))
        return match
    return None
def search_for_string(pdf, text):
    """Scan the beginning of a pdf for a word sequence and score the best match.

    The text of all pages is split into words and only the first fifth of
    them is searched. A window as long as the query slides over those words;
    each window is compared to the query with difflib.SequenceMatcher. The
    comparison is case sensitive.

    :pdf: pdf reader object exposing ``pages`` (e.g. PyPDF2.PdfFileReader),
        or a path to a pdf file that is then opened with PyPDF2
    :text: str or list of str -- String (or words) to search for
    :returns: float -- best similarity ratio in [0, 1]; 0 if nothing matched
        or if text is empty
    """
    if not text:
        return 0
    # Anything that already offers pages is used as is; everything else is
    # treated as a file location.
    if not hasattr(pdf, 'pages'):
        pdf = PyPDF2.PdfFileReader(str(pdf))
    if not isinstance(text, list):
        text = text.split()
    if not text:
        return 0
    text_set = set(text)

    # split() without arguments already drops newlines and repeated spaces
    words = ' '.join(page.extractText() for page in pdf.pages).split()
    # title should occur in first fifth (hopefully)
    words = words[:len(words) // 5]

    # Cheap pre-filter: a window must share some words with the query before
    # the expensive comparison runs. One-word queries can share one at most.
    min_overlap = min(2, len(text_set))

    best_score = 0
    best_match = None
    print("Searching for: %s" % text)
    # "+ 1" so that the window ending on the last word is tested as well
    for start in range(len(words) - len(text) + 1):
        subsequence = words[start:start + len(text)]
        if len(text_set.intersection(subsequence)) < min_overlap:
            continue
        score = SequenceMatcher(None, subsequence, text,
                                autojunk=False).ratio()
        if score > best_score:
            best_score = score
            best_match = subsequence
    if best_match:
        print("Best match for\n%s:\n%s\nscore=%f" % (text, best_match,
                                                     best_score))
    return best_score