diff --git a/pdfsearch/__init__.py b/pdfsearch/__init__.py index 8b23fc8..1bd0b09 100644 --- a/pdfsearch/__init__.py +++ b/pdfsearch/__init__.py @@ -1,93 +1 @@ -import os -import re -import PyPDF2 -from zlib import error as ZLibError -from PyPDF2.utils import PdfReadError -from tqdm import tqdm - - -def search_string(pattern, string, match_object, context_length): - """Get details of SRE_Match object. - - Args: - pattern: re search pattern - string: re search string - match_object: SRE_Match object - context_length: number of words to include before and after - - Returns: - whole_word: word around pattern match, including prefix and suffix - context: words adjacent to pattern match - """ - - idx_1 = match_object.start() - idx_2 = match_object.end() - - # Get substrings before and after search result - str_before = string[:idx_1] - str_after = string[idx_2:] - - # Get actual word match - str_match_prefix = string[:idx_1].split(' ')[-1:][0] - str_match_suffix = string[idx_2:].split(' ')[:1][0] - whole_word = str_match_prefix + pattern + str_match_suffix - - # Get word in context - words_before = str_before.split(' ')[-1 - context_length:-1] - words_after = str_after.split(' ')[1:context_length + 1] - context = ' '.join(words_before + [whole_word] + words_after) - - return whole_word, context - - -def search_pdf(pdf_name, search_patterns, context_length): - """Search for text strings inside pdf. - - Args: - pdf_name: path to pdf file - search_patterns: list of re search patterns - context_length: number of words to include before and after - - Returns: - A list of dictionaries containing search results - """ - - search_results = [] - with open(pdf_name, 'rb') as pdf: - pdf_shortname = os.path.split(pdf_name)[1] - try: - reader = PyPDF2.PdfFileReader(pdf) - # Skip malformed pdfs - except PdfReadError as e: - print(e, pdf_shortname) - raise ValueError - num_pages = reader.getNumPages() - - # Set up tqdm progress bar - pbar = tqdm(range(num_pages)) - for i in pbar: - pbar.set_description('Searching {}'.format(pdf_shortname)) - page_num = i + 1 - page = reader.getPage(i) - try: - page_text = page.extractText().replace('\n', '') - except (KeyError, ZLibError): - continue - - for pattern in search_patterns: - # Find matches on current page - matches = re.finditer(pattern.lower(), page_text.lower()) - for match in matches: - whole_word, context = search_string( - pattern, page_text, match, context_length) - - # Update search results - search_results.append({ - 'document': pdf_name, - 'page': page_num, - 'pattern': pattern, - 'word': whole_word, - 'context': context, - }) - - return search_results +from .pdfsearch import search_pdf diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..bafba09 --- /dev/null +++ b/setup.py @@ -0,0 +1,9 @@ +from setuptools import setup + +setup( + name='pdfsearch', + version='0.1.0', + packages=['pdfsearch'], + author='Dan Howe', + author_email='d.howe@wrl.unsw.edu.au', + description='Search for text strings inside a pdf')