From f5ed362c8df0eba297a5e043855acf2b7dd61fa0 Mon Sep 17 00:00:00 2001
From: Dan Howe <d.howe@wrl.unsw.edu.au>
Date: Wed, 11 Apr 2018 10:12:05 +1000
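Subject: [PATCH] Add 'setup.py' and reduce pdfsearch/__init__.py to a re-export

Add a setuptools-based setup.py for the 'pdfsearch' package (version 0.1.0).

Remove the search implementation from pdfsearch/__init__.py and replace it
with a single re-export of search_pdf from the .pdfsearch submodule.
pdfsearch/pdfsearch.py is not part of this patch and must already be
present for the package to import.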

---
 pdfsearch/__init__.py | 94 +------------------------------------------
 setup.py              |  9 +++++
 2 files changed, 10 insertions(+), 93 deletions(-)
 create mode 100644 setup.py

diff --git a/pdfsearch/__init__.py b/pdfsearch/__init__.py
index 8b23fc8..1bd0b09 100644
--- a/pdfsearch/__init__.py
+++ b/pdfsearch/__init__.py
@@ -1,93 +1 @@
-import os
-import re
-import PyPDF2
-from zlib import error as ZLibError
-from PyPDF2.utils import PdfReadError
-from tqdm import tqdm
-
-
-def search_string(pattern, string, match_object, context_length):
-    """Get details of SRE_Match object.
-
-    Args:
-        pattern:        re search pattern
-        string:         re search string
-        match_object:   SRE_Match object
-        context_length: number of words to include before and after
-
-    Returns:
-        whole_word: word around pattern match, including prefix and suffix
-        context:    words adjacent to pattern match
-    """
-
-    idx_1 = match_object.start()
-    idx_2 = match_object.end()
-
-    # Get substrings before and after search result
-    str_before = string[:idx_1]
-    str_after = string[idx_2:]
-
-    # Get actual word match
-    str_match_prefix = string[:idx_1].split(' ')[-1:][0]
-    str_match_suffix = string[idx_2:].split(' ')[:1][0]
-    whole_word = str_match_prefix + pattern + str_match_suffix
-
-    # Get word in context
-    words_before = str_before.split(' ')[-1 - context_length:-1]
-    words_after = str_after.split(' ')[1:context_length + 1]
-    context = ' '.join(words_before + [whole_word] + words_after)
-
-    return whole_word, context
-
-
-def search_pdf(pdf_name, search_patterns, context_length):
-    """Search for text strings inside pdf.
-
-    Args:
-        pdf_name:        path to pdf file
-        search_patterns: list of re search patterns
-        context_length:  number of words to include before and after
-
-    Returns:
-        A list of dictionaries containing search results
-    """
-    
-    search_results = []
-    with open(pdf_name, 'rb') as pdf:
-        pdf_shortname = os.path.split(pdf_name)[1]
-        try:
-            reader = PyPDF2.PdfFileReader(pdf)
-        # Skip malformed pdfs
-        except PdfReadError as e:
-            print(e, pdf_shortname)
-            raise ValueError
-        num_pages = reader.getNumPages()
-
-        # Set up tqdm progress bar
-        pbar = tqdm(range(num_pages))
-        for i in pbar:
-            pbar.set_description('Searching {}'.format(pdf_shortname))
-            page_num = i + 1
-            page = reader.getPage(i)
-            try:
-                page_text = page.extractText().replace('\n', '')
-            except (KeyError, ZLibError):
-                continue
-
-            for pattern in search_patterns:
-                # Find matches on current page
-                matches = re.finditer(pattern.lower(), page_text.lower())
-                for match in matches:
-                    whole_word, context = search_string(
-                        pattern, page_text, match, context_length)
-
-                    # Update search results
-                    search_results.append({
-                        'document': pdf_name,
-                        'page': page_num,
-                        'pattern': pattern,
-                        'word': whole_word,
-                        'context': context,
-                    })
-
-    return search_results
+from .pdfsearch import search_pdf
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..bafba09
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,9 @@
+from setuptools import setup
+
+setup(
+    name='pdfsearch',
+    version='0.1.0',
+    packages=['pdfsearch'],
+    author='Dan Howe',
+    author_email='d.howe@wrl.unsw.edu.au',
+    description='Search for text strings inside a pdf')