# Reconstructed from patch 17ea25e8b595762bf00dc236123261a2e0a8969f
# ("Initial commit", Dan Howe, Wed, 11 Apr 2018 09:01:43 +1000),
# which creates pdfsearch/__init__.py.
"""Search the text of PDF documents for regular-expression matches."""

import os
import re
from zlib import error as ZLibError

import PyPDF2
from PyPDF2.utils import PdfReadError
from tqdm import tqdm


def search_string(pattern, string, match_object, context_length):
    """Get the whole word and surrounding words for a regex match.

    Args:
        pattern: re search pattern that produced ``match_object``. Kept for
            backward compatibility; the matched text itself is read from
            ``string`` so that real regular expressions are reported as the
            text they matched rather than as the pattern source.
        string: the text that was searched; ``match_object`` offsets must
            index into this exact string
        match_object: re match object
        context_length: number of words to include before and after
            (negative values are treated as 0)

    Returns:
        whole_word: word around pattern match, including prefix and suffix
        context: words adjacent to pattern match
    """

    idx_1 = match_object.start()
    idx_2 = match_object.end()

    # Words are delimited by single spaces. str.split always returns at
    # least one element, so the [-1] / [0] lookups below cannot fail.
    words_before = string[:idx_1].split(' ')
    words_after = string[idx_2:].split(' ')

    # The last fragment before the match and the first fragment after it
    # belong to the same word as the match itself.
    matched_text = string[idx_1:idx_2]
    whole_word = words_before[-1] + matched_text + words_after[0]

    # Get word in context, excluding the fragments already merged above
    num_words = max(context_length, 0)
    context_before = words_before[-1 - num_words:-1]
    context_after = words_after[1:num_words + 1]
    context = ' '.join(context_before + [whole_word] + context_after)

    return whole_word, context


def search_pdf(pdf_name, search_patterns, context_length):
    """Search for text strings inside pdf.

    Matching is case-insensitive.

    Args:
        pdf_name: path to pdf file
        search_patterns: iterable of re search patterns (strings)
        context_length: number of words to include before and after

    Returns:
        A list of dictionaries containing search results. Each dictionary
        has the keys 'document', 'page' (1-based), 'pattern', 'word' and
        'context'.

    Raises:
        ValueError: if PyPDF2 cannot read the file (PdfReadError).
    """

    search_results = []
    pdf_shortname = os.path.split(pdf_name)[1]

    with open(pdf_name, 'rb') as pdf:
        try:
            reader = PyPDF2.PdfFileReader(pdf)
        except PdfReadError as e:
            # Report the malformed pdf, then signal it to the caller with
            # the same exception type as before so it can be skipped.
            print(e, pdf_shortname)
            raise ValueError(
                'Could not read pdf: {}'.format(pdf_shortname))
        num_pages = reader.getNumPages()

        # Compile each pattern once, case-insensitively. Lower-casing the
        # pattern text instead would corrupt escapes such as \S or \W, and
        # lower-casing the page text can change its length, which would
        # misalign match offsets with the original text.
        compiled_patterns = [
            (pattern, re.compile(pattern, re.IGNORECASE))
            for pattern in search_patterns
        ]

        # Set up tqdm progress bar
        pbar = tqdm(range(num_pages),
                    desc='Searching {}'.format(pdf_shortname))
        for i in pbar:
            page_num = i + 1
            page = reader.getPage(i)
            try:
                page_text = page.extractText().replace('\n', '')
            except (KeyError, ZLibError):
                # Text could not be extracted from this page; skip it
                continue

            for pattern, regex in compiled_patterns:
                # Find matches on current page
                for match in regex.finditer(page_text):
                    whole_word, context = search_string(
                        pattern, page_text, match, context_length)

                    # Update search results
                    search_results.append({
                        'document': pdf_name,
                        'page': page_num,
                        'pattern': pattern,
                        'word': whole_word,
                        'context': context,
                    })

    return search_results