Add 'setup.py'

8 years ago · f5ed362c8d
parent 17ea25e8b5
commit f5ed362c8d
2 changed files with 10 additions and 93 deletions
--- a/pdfsearch/__init__.py
+++ b/pdfsearch/__init__.py
@@ -1,93 +1 @@
-import os
+from .pdfsearch import search_pdf
 import re
 import PyPDF2
 from zlib import error as ZLibError
 from PyPDF2.utils import PdfReadError
 from tqdm import tqdm
 def search_string(pattern, string, match_object, context_length):
    """Get the word containing a regex match, plus its neighbouring words.

    The matched text is sliced out of ``string`` with the match offsets, so
    the result shows what the document actually contains (original casing,
    real characters) instead of the source text of the search pattern.

    Args:
        pattern:        re search pattern (kept for backward compatibility;
                        it is not used to build the result)
        string:         text that the match offsets refer to
        match_object:   re match object providing start() and end()
        context_length: number of words to include before and after

    Returns:
        whole_word: matched text together with any characters attached to
                    it, up to the nearest space on either side
        context:    whole_word joined with up to context_length
                    space-separated words on each side
    """
    start = match_object.start()
    end = match_object.end()

    # str.split(' ') always yields at least one item, so indexing the last
    # piece before and the first piece after the match is always safe.
    pieces_before = string[:start].split(' ')
    pieces_after = string[end:].split(' ')

    # The pieces touching the match are the rest of the word it sits in.
    whole_word = pieces_before[-1] + string[start:end] + pieces_after[0]

    # Skip those touching pieces when collecting the neighbouring words.
    words_before = pieces_before[-1 - context_length:-1]
    words_after = pieces_after[1:context_length + 1]
    context = ' '.join(words_before + [whole_word] + words_after)
    return whole_word, context
 def search_pdf(pdf_name, search_patterns, context_length):
    """Search for text strings inside pdf.

    Patterns are matched case-insensitively against the text extracted from
    each page. Pages whose text cannot be extracted are skipped.

    Args:
        pdf_name:        path to pdf file
        search_patterns: list of re search patterns
        context_length:  number of words to include before and after

    Returns:
        A list of dictionaries containing search results, one per match,
        with the keys 'document', 'page' (1-based), 'pattern', 'word' and
        'context'.

    Raises:
        ValueError: if PyPDF2 reports the pdf as unreadable (PdfReadError)
    """
    search_results = []
    with open(pdf_name, 'rb') as pdf:
        pdf_shortname = os.path.split(pdf_name)[1]
        try:
            reader = PyPDF2.PdfFileReader(pdf)
        except PdfReadError as e:
            # Report the malformed pdf, then signal it to the caller.
            print(e, pdf_shortname)
            raise ValueError(
                'Cannot read {}: {}'.format(pdf_shortname, e)) from e

        # Compile once, outside the page loop. re.IGNORECASE is used
        # instead of lowercasing the pattern, because lowercasing changes
        # the meaning of escapes such as \S, \D, \W and \B. Matching the
        # unmodified page text also keeps the match offsets valid for it.
        compiled_patterns = [
            (pattern, re.compile(pattern, re.IGNORECASE))
            for pattern in search_patterns]

        num_pages = reader.getNumPages()

        # Set up tqdm progress bar
        pbar = tqdm(
            range(num_pages), desc='Searching {}'.format(pdf_shortname))
        for i in pbar:
            page = reader.getPage(i)
            try:
                page_text = page.extractText().replace('\n', '')
            except (KeyError, ZLibError):
                # Best effort: skip pages whose text cannot be extracted.
                continue

            for pattern, regex in compiled_patterns:
                # Find matches on current page
                for match in regex.finditer(page_text):
                    whole_word, context = search_string(
                        pattern, page_text, match, context_length)

                    # Update search results
                    search_results.append({
                        'document': pdf_name,
                        'page': i + 1,
                        'pattern': pattern,
                        'word': whole_word,
                        'context': context,
                    })
    return search_results
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,9 @@
 from setuptools import setup

 # Distribution metadata for the 'pdfsearch' package, gathered in one
 # mapping and handed to setuptools below.
 # NOTE(review): no install_requires is declared here; confirm whether the
 # package's third-party imports should be listed as dependencies.
 package_metadata = {
    'name': 'pdfsearch',
    'version': '0.1.0',
    'packages': ['pdfsearch'],
    'author': 'Dan Howe',
    'author_email': 'd.howe@wrl.unsw.edu.au',
    'description': 'Search for text strings inside a pdf',
 }

 setup(**package_metadata)