Source code for pdf_util

import pybtex.database
from pylatexenc.latex2text import LatexNodes2Text  # to unescape latex
from pathlib import Path
import re
import PyPDF2
from difflib import SequenceMatcher


def get_title_info(entry):
    """Try to guess title information from a publication.

    :entry: pathlib.Path pointing to a pdf file, or a pybtex.database.Entry
    :returns: str -- the title, or None if no title could be found
    :raises NotImplementedError: if entry is neither a Path nor an Entry
    """
    if isinstance(entry, Path):
        pdf = PyPDF2.PdfFileReader(str(entry))
        # documentInfo is None when the pdf has no document information
        # dictionary, so it must be checked before the membership test.
        info = pdf.documentInfo
        if info is not None and '/Title' in info:
            return info['/Title']
        return None

    if isinstance(entry, pybtex.database.Entry):
        # Prefer the entry's own title and fall back to the book title.
        for field in ('title', 'booktitle'):
            if field in entry.fields:
                return LatexNodes2Text().latex_to_text(entry.fields[field])
        return None

    raise NotImplementedError(
        "Can only handle pdf or bib objects (was %s)" % type(entry))
def get_author_info(entry):
    """Try to extract some form of author string from a pdf or a bibitem.

    For a pdf, the '/Author' metadata entry is used; if it is missing the
    file name (without suffix) is returned instead.

    For a bibliography entry, the raw 'author' and 'authors' fields are
    combined with the names of the parsed persons. Editors are only used
    when the entry has no parsed authors, so that author names are never
    replaced by editor names.

    :entry: pathlib.Path pointing to a pdf file, or a pybtex.database.Entry
    :returns: str -- String with author info or None, if nothing found
    :raises NotImplementedError: if entry is neither a Path nor an Entry
    """
    if isinstance(entry, Path):
        pdf = PyPDF2.PdfFileReader(str(entry))
        # documentInfo is None when the pdf has no document information
        # dictionary, so it must be checked before the membership test.
        info = pdf.documentInfo
        if info is not None and '/Author' in info:
            return info['/Author']
        # last straw: filename
        return entry.stem

    if isinstance(entry, pybtex.database.Entry):
        parts = []
        for field in ('author', 'authors'):
            if field in entry.fields:
                parts.append(str(entry.fields[field]))

        # Look the role up as a key of the persons mapping; a substring test
        # on its string representation can hit on a person's name instead.
        for role in ('author', 'editor'):
            if role in entry.persons and entry.persons[role]:
                names = [
                    ' '.join(p.first_names + p.middle_names + p.last_names)
                    for p in entry.persons[role]
                ]
                parts.append(' '.join(names))
                break

        combined = ' '.join(part for part in parts if part)
        author_info = LatexNodes2Text().latex_to_text(combined).strip()
        return author_info or None

    raise NotImplementedError(
        "Can only handle pdf or bib objects (was %s)" % type(entry))
def pdf_for_pub(entry, pdf_folder):
    """Attempt to guess which pdf file might belong to a given bibliography entry.

    Every pdf file directly inside ``pdf_folder`` is scored against the
    entry: the title score (weight 3) comes from searching the pdf text for
    the entry's title, the author score (weight 1) from comparing the pdf's
    author information with the entry's. The best-scoring file is accepted
    only if its combined score is above 0.5.

    :entry: pybtex.database.Entry object
    :pdf_folder: str or Path denoting folder location
    :returns: pathlib.Path of the best matching pdf, or None if no file
        scored above 0.5
    :raises OSError: if pdf_folder does not exist or is not a folder
    """
    pdf_path = Path(pdf_folder)
    if not pdf_path.exists():
        raise OSError('Folder %s does not exist.' % pdf_folder)
    if not pdf_path.is_dir():
        raise OSError('%s is not a folder.' % pdf_folder)

    # The bibliography side does not change between files: extract it once.
    author_info = get_author_info(entry)
    title_info = get_title_info(entry)
    author_words = author_info.split() if author_info else []
    if not author_words and not title_info:
        # Nothing to compare against.
        return None

    match = None
    confidence = 0
    for candidate in pdf_path.iterdir():
        # Subfolders and non-pdf files cannot be parsed as pdf.
        if not candidate.is_file() or candidate.suffix.lower() != '.pdf':
            continue

        if title_info:
            confidence_title = search_for_string(candidate, title_info)
        else:
            confidence_title = 0.0

        if author_words:
            author_info_pdf = get_author_info(candidate) or ''
            confidence_author = SequenceMatcher(
                None, author_info_pdf.split(), author_words,
                autojunk=False).ratio()
        else:
            # SequenceMatcher rates two empty sequences as a perfect match,
            # which would reward missing author information.
            confidence_author = 0.0

        new_confidence = (3 * confidence_title + confidence_author) / 4
        if confidence < new_confidence:
            confidence = new_confidence
            match = candidate

    if match and confidence > 0.5:
        if 'title' in entry.fields:
            title = entry.fields['title']
        elif 'booktitle' in entry.fields:
            title = entry.fields['booktitle']
        else:
            title = '<no title>'
        print('Match for %s: %s, score=%f' % (title, match, confidence))
        return match
    return None
def search_for_string(pdf, text):
    """Scan the beginning of a pdf file for a string and return the best score.

    The pdf text is split into words and only its first fifth is searched,
    since the title is expected near the beginning. A window as long as the
    query is slid over those words and each window is compared to the query
    with difflib.SequenceMatcher.

    :pdf: PyPDF2.PdfFileReader, or anything whose str() is a path to a pdf
    :text: str or list of str -- String (or list of words) to search for
    :returns: float -- best similarity ratio found (between 0 and 1); 0 if
        text is empty or no window shares enough words with it
    """
    if text is None:
        words = []
    elif isinstance(text, str):
        words = text.split()
    else:
        words = list(text)
    if not words:
        # Nothing to search for; do not bother opening the pdf.
        return 0.0

    if not isinstance(pdf, PyPDF2.PdfFileReader):
        pdf = PyPDF2.PdfFileReader(str(pdf))

    word_set = set(words)
    # split() already collapses newlines and superfluous spaces
    pdf_words = ' '.join(page.extractText() for page in pdf.pages).split()
    # title should occur in first fifth (hopefully)
    pdf_words = pdf_words[:len(pdf_words) // 5]

    # Cheap pre-filter: a window must share at least two words with the
    # query (or the single word, if the query has only one distinct word).
    min_shared = min(2, len(word_set))

    best_score = 0.0
    best_match = None
    print("Searching for: %s" % words)
    # "+ 1" so that the final window is scored as well.
    for start in range(len(pdf_words) - len(words) + 1):
        window = pdf_words[start:start + len(words)]
        if len(word_set.intersection(window)) < min_shared:
            continue
        score = SequenceMatcher(None, window, words, autojunk=False).ratio()
        if score > best_score:
            best_score = score
            best_match = window

    if best_match:
        print("Best match for\n%s:\n%s\nscore=%f"
              % (words, best_match, best_score))
    return best_score