Initial commit

8 years ago · 17ea25e8b5
commit 17ea25e8b5
1 changed files with 93 additions and 0 deletions
--- a/pdfsearch/init.py
+++ b/pdfsearch/init.py
@ -0,0 +1,93 @@
+import os
+import re
+import PyPDF2
+from zlib import error as ZLibError
+from PyPDF2.utils import PdfReadError
+from tqdm import tqdm
+
+
def search_string(pattern, string, match_object, context_length):
    """Get the whole word and surrounding context for a regex match.

    Args:
        pattern:        re search pattern (kept for interface compatibility;
                        the matched text is taken from ``string`` itself)
        string:         text the match offsets refer to
        match_object:   match object whose start()/end() index into string
        context_length: number of words to include before and after

    Returns:
        whole_word: word around pattern match, including prefix and suffix
        context:    words adjacent to pattern match
    """
    start = match_object.start()
    end = match_object.end()

    # str.split(' ') always yields at least one item, so [-1] / [0] are safe.
    pieces_before = string[:start].split(' ')
    pieces_after = string[end:].split(' ')

    # Use the text that actually matched, not the pattern source: for a real
    # regex the two differ, and for literals this keeps the original casing.
    matched_text = string[start:end]
    whole_word = pieces_before[-1] + matched_text + pieces_after[0]

    # The last piece before / first piece after the match belong to the
    # matched word itself, so they are excluded from the context words.
    words_before = pieces_before[-1 - context_length:-1]
    words_after = pieces_after[1:context_length + 1]
    context = ' '.join(words_before + [whole_word] + words_after)

    return whole_word, context
+
+
def search_pdf(pdf_name, search_patterns, context_length):
    """Search for text strings inside pdf.

    Matching is case-insensitive and is performed page by page on the
    extracted text with newlines removed.

    Args:
        pdf_name:        path to pdf file
        search_patterns: iterable of re search patterns (strings)
        context_length:  number of words to include before and after

    Returns:
        A list of dictionaries containing search results, one per match,
        with the keys 'document', 'page' (1-based), 'pattern', 'word'
        and 'context'.

    Raises:
        ValueError: if PyPDF2 cannot read the file (PdfReadError).
    """

    search_results = []
    pdf_shortname = os.path.split(pdf_name)[1]
    with open(pdf_name, 'rb') as pdf:
        try:
            reader = PyPDF2.PdfFileReader(pdf)
        # Report malformed pdfs and signal the caller with ValueError
        except PdfReadError as e:
            print(e, pdf_shortname)
            raise ValueError('{}: {}'.format(pdf_shortname, e))
        num_pages = reader.getNumPages()

        # Compile once, outside the page loop. re.IGNORECASE replaces the
        # former pattern.lower()/text.lower() pair, which corrupted escapes
        # such as \S, \W, \D (lowercased to \s, \w, \d) and could shift match
        # offsets when lower() changes the length of the text.
        compiled_patterns = [
            (pattern, re.compile(pattern, re.IGNORECASE))
            for pattern in search_patterns
        ]

        # Set up tqdm progress bar
        pbar = tqdm(range(num_pages),
                    desc='Searching {}'.format(pdf_shortname))
        for i in pbar:
            page_num = i + 1
            page = reader.getPage(i)
            try:
                page_text = page.extractText().replace('\n', '')
            except (KeyError, ZLibError):
                # Best effort: pages whose text cannot be extracted are
                # skipped rather than aborting the whole document.
                continue

            for pattern, regex in compiled_patterns:
                # Find matches on current page
                for match in regex.finditer(page_text):
                    whole_word, context = search_string(
                        pattern, page_text, match, context_length)

                    # Update search results
                    search_results.append({
                        'document': pdf_name,
                        'page': page_num,
                        'pattern': pattern,
                        'word': whole_word,
                        'context': context,
                    })

    return search_results