Add 'pdfsearch.py'
parent
f5ed362c8d
commit
00d7205e9e
@ -0,0 +1,97 @@
|
|||||||
|
import os
|
||||||
|
import re
|
||||||
|
import PyPDF2
|
||||||
|
from PyPDF2.utils import PdfReadError
|
||||||
|
from zlib import error as ZLibError
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
def search_string(pattern, string, match_object, context_length):
    """Return the whole word containing a regex match, plus nearby words.

    Words are delimited by single space characters only.

    Args:
        pattern: re search pattern that produced the match. Kept for
            interface compatibility; the matched text is read from
            ``string`` so that regex syntax never leaks into the result.
        string: text that the match offsets refer to
        match_object: re match object; only start() and end() are used
        context_length: number of words to include before and after

    Returns:
        whole_word: matched text extended to the surrounding spaces,
            i.e. including any word prefix and suffix around the match
        context: whole_word joined with up to context_length words on
            either side, separated by single spaces
    """
    start = match_object.start()
    end = match_object.end()

    # Splitting on ' ' leaves the partial word touching the match as the
    # last piece before it and the first piece after it (either may be '').
    pieces_before = string[:start].split(' ')
    pieces_after = string[end:].split(' ')

    # Use the text that actually matched, not the pattern: the pattern may
    # contain regex syntax or differ in case from the source text.
    matched_text = string[start:end]
    whole_word = pieces_before[-1] + matched_text + pieces_after[0]

    # Neighbouring words, excluding the partial pieces already merged above
    words_before = pieces_before[-1 - context_length:-1]
    words_after = pieces_after[1:context_length + 1]
    context = ' '.join(words_before + [whole_word] + words_after)

    return whole_word, context
|
||||||
|
|
||||||
|
|
||||||
|
def search_pdf(pdf_name, search_patterns, context_length):
    """Search for text strings inside a pdf.

    A pattern that contains no uppercase characters is matched
    case-insensitively; any other pattern is matched case-sensitively.
    Pages whose text extraction raises KeyError or a zlib error are
    skipped.

    Args:
        pdf_name: path to pdf file
        search_patterns: re search pattern, or list of re search patterns
        context_length: number of words to include before and after

    Returns:
        A list of dictionaries containing search results, one per match,
        with the keys 'document', 'page' (1-based), 'pattern', 'word'
        and 'context'
    """

    # Convert search patterns to list, if a single search string is provided
    if isinstance(search_patterns, str):
        search_patterns = [search_patterns]

    # Compile every pattern once instead of once per page. IGNORECASE is
    # applied to the original text rather than matching a lowercased copy,
    # so match offsets always line up with the text handed to
    # search_string (str.lower() can change the length of some strings).
    compiled_patterns = []
    for pattern in search_patterns:
        flags = re.IGNORECASE if pattern == pattern.lower() else 0
        compiled_patterns.append((pattern, re.compile(pattern, flags)))

    search_results = []
    with open(pdf_name, 'rb') as pdf:
        pdf_shortname = os.path.basename(pdf_name)
        reader = PyPDF2.PdfFileReader(pdf)
        num_pages = reader.getNumPages()

        # Set up tqdm progress bar; the description does not change per page
        pbar = tqdm(range(num_pages))
        pbar.set_description('Searching {}'.format(pdf_shortname))

        for i in pbar:
            page_num = i + 1
            page = reader.getPage(i)
            try:
                page_text = page.extractText().replace('\n', '')
            # Skip page if text cannot be read
            except (KeyError, ZLibError):
                continue

            for pattern, regex in compiled_patterns:
                for match in regex.finditer(page_text):
                    whole_word, context = search_string(
                        pattern, page_text, match, context_length)

                    # Update search results
                    search_results.append({
                        'document': pdf_name,
                        'page': page_num,
                        'pattern': pattern,
                        'word': whole_word,
                        'context': context,
                    })

    return search_results
|
Loading…
Reference in New Issue