Add 'setup.py'

master
Dan Howe 7 years ago
parent 17ea25e8b5
commit f5ed362c8d

@ -1,93 +1 @@
import os from .pdfsearch import search_pdf
import re
import PyPDF2
from zlib import error as ZLibError
from PyPDF2.utils import PdfReadError
from tqdm import tqdm
def search_string(pattern, string, match_object, context_length):
    """Get details of SRE_Match object.

    Args:
        pattern: re search pattern (kept for interface compatibility; the
            matched text itself is read from ``string``)
        string: re search string
        match_object: SRE_Match object
        context_length: number of words to include before and after

    Returns:
        whole_word: word around pattern match, including prefix and suffix
        context: words adjacent to pattern match
    """
    idx_1 = match_object.start()
    idx_2 = match_object.end()

    # Split the text on either side of the match into space-separated words.
    # str.split(' ') always returns at least one element, so indexing the
    # last/first item below is safe even when the match touches an edge.
    words_before = string[:idx_1].split(' ')
    words_after = string[idx_2:].split(' ')

    # Rebuild the full word from the text that actually matched. Using the
    # pattern here would leak regex syntax (e.g. 'colou?r') into the result
    # and discard the casing used in the document.
    whole_word = words_before[-1] + string[idx_1:idx_2] + words_after[0]

    # The last item of words_before and the first item of words_after are
    # fragments of the matched word itself, so they are excluded from the
    # surrounding context.
    context_before = words_before[-1 - context_length:-1]
    context_after = words_after[1:context_length + 1]
    context = ' '.join(context_before + [whole_word] + context_after)

    return whole_word, context
def search_pdf(pdf_name, search_patterns, context_length):
    """Search for text strings inside pdf.

    Matching is case-insensitive. Pages whose text extraction raises
    KeyError or a zlib error are skipped.

    Args:
        pdf_name: path to pdf file
        search_patterns: list of re search patterns
        context_length: number of words to include before and after

    Returns:
        A list of dictionaries containing search results, with the keys
        'document', 'page' (1-based), 'pattern', 'word' and 'context'

    Raises:
        ValueError: if PyPDF2 reports a PdfReadError while opening the pdf
    """
    search_results = []
    pdf_shortname = os.path.split(pdf_name)[1]

    with open(pdf_name, 'rb') as pdf:
        try:
            reader = PyPDF2.PdfFileReader(pdf)
        except PdfReadError as e:
            # Report the malformed pdf, then signal the caller so it can
            # decide whether to skip this document.
            print(e, pdf_shortname)
            raise ValueError(
                'Could not read pdf: {}'.format(pdf_shortname))

        num_pages = reader.getNumPages()

        # Compile once with IGNORECASE instead of lowercasing the pattern:
        # lowercasing changes the meaning of regex syntax such as '\S', '\W'
        # or '(?P<name>...)'. Matching on the original text also keeps the
        # match offsets valid for search_string().
        compiled_patterns = [
            (pattern, re.compile(pattern, re.IGNORECASE))
            for pattern in search_patterns
        ]

        # Set up tqdm progress bar
        pbar = tqdm(
            range(num_pages), desc='Searching {}'.format(pdf_shortname))
        for i in pbar:
            page = reader.getPage(i)
            try:
                page_text = page.extractText().replace('\n', '')
            except (KeyError, ZLibError):
                # Text could not be extracted from this page; move on
                continue

            for pattern, regex in compiled_patterns:
                # Find matches on current page
                for match in regex.finditer(page_text):
                    whole_word, context = search_string(
                        pattern, page_text, match, context_length)

                    # Update search results
                    search_results.append({
                        'document': pdf_name,
                        'page': i + 1,
                        'pattern': pattern,
                        'word': whole_word,
                        'context': context,
                    })

    return search_results

@ -0,0 +1,9 @@
from setuptools import setup

# Package metadata for pdfsearch.
# The package imports PyPDF2 and tqdm at module level, so both are declared
# as install requirements.
# NOTE(review): the code uses the legacy PyPDF2 API (PyPDF2.utils,
# PdfFileReader, getPage, extractText), which is presumably unavailable in
# later major releases -- confirm the upper bound below.
setup(
    name='pdfsearch',
    version='0.1.0',
    packages=['pdfsearch'],
    install_requires=[
        'PyPDF2<2',
        'tqdm',
    ],
    author='Dan Howe',
    author_email='d.howe@wrl.unsw.edu.au',
    description='Search for text strings inside a pdf')
Loading…
Cancel
Save