"""Search the text of PDF files for regular-expression patterns."""

import os
import re
from zlib import error as ZLibError

import PyPDF2
# NOTE(review): PdfReadError is not referenced in this module; the import is
# retained from the original import list.
from PyPDF2.utils import PdfReadError
from tqdm import tqdm


def search_string(pattern, string, match_object, context_length):
    """Get the whole word and surrounding context of a regex match.

    Words are delimited by single spaces only.

    Args:
        pattern: re search pattern. Kept for interface compatibility; the
            matched text is taken from ``string`` so that regex patterns and
            case-insensitive matches report what the document really says.
        string: text that the offsets of ``match_object`` refer to
        match_object: match object produced by searching ``string``
        context_length: number of words to include before and after
            (negative values are treated as 0)

    Returns:
        whole_word: word around pattern match, including prefix and suffix
        context: words adjacent to pattern match
    """

    # A negative length would make the "before" slice below start from the
    # beginning of the text instead of producing an empty context.
    context_length = max(context_length, 0)

    start, end = match_object.span()

    # str.split always returns at least one element, so the [-1] and [0]
    # lookups below are safe even when the match touches either end.
    words_before = string[:start].split(' ')
    words_after = string[end:].split(' ')

    # The fragment directly before/after the match belongs to the same word.
    whole_word = words_before[-1] + string[start:end] + words_after[0]

    # Neighbouring words, excluding the fragments already merged above.
    context_words = (
        words_before[-1 - context_length:-1]
        + [whole_word]
        + words_after[1:context_length + 1]
    )

    return whole_word, ' '.join(context_words)


def search_pdf(pdf_name, search_patterns, context_length):
    """Search for text strings inside a pdf.

    A pattern that contains no uppercase characters is matched
    case-insensitively; any other pattern is matched case-sensitively.
    Pages whose text cannot be extracted are skipped.

    Args:
        pdf_name: path to pdf file
        search_patterns: list of re search patterns, or a single pattern
        context_length: number of words to include before and after

    Returns:
        A list of dictionaries containing search results, one per match,
        with the keys 'document', 'page' (1-based), 'pattern', 'word' and
        'context'.
    """

    # Convert search patterns to list, if a single search string is provided
    if isinstance(search_patterns, str):
        search_patterns = [search_patterns]

    # Compile every pattern once instead of once per page. Matching with
    # re.IGNORECASE against the original text (rather than a lowercased
    # copy) keeps the match offsets valid for the text that is handed to
    # search_string, since str.lower() can change the length of a string.
    compiled_patterns = []
    for pattern in search_patterns:
        flags = re.IGNORECASE if pattern == pattern.lower() else 0
        compiled_patterns.append((pattern, re.compile(pattern, flags)))

    search_results = []
    with open(pdf_name, 'rb') as pdf:
        reader = PyPDF2.PdfFileReader(pdf)
        description = 'Searching {}'.format(os.path.basename(pdf_name))

        # Set up tqdm progress bar
        for i in tqdm(range(reader.getNumPages()), desc=description):
            page = reader.getPage(i)
            try:
                page_text = page.extractText().replace('\n', '')
            # Skip page if text cannot be read
            except (KeyError, ZLibError):
                continue

            for pattern, regex in compiled_patterns:
                for match in regex.finditer(page_text):
                    whole_word, context = search_string(
                        pattern, page_text, match, context_length)

                    # Update search results
                    search_results.append({
                        'document': pdf_name,
                        'page': i + 1,
                        'pattern': pattern,
                        'word': whole_word,
                        'context': context,
                    })

    return search_results