|
|
|
@ -1,11 +1,86 @@
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import PyPDF2
|
|
|
|
|
from PyPDF2.utils import PdfReadError
|
|
|
|
|
from zlib import error as ZLibError
|
|
|
|
|
import subprocess
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_cli_args(kwargs):
    """Convert keyword arguments into a flat list of command line arguments.

    Each key becomes a single-dash option ('-' + key). A value of True
    emits the option alone (boolean switch); any other included value is
    emitted right after the option as str(value).

    Args:
        kwargs: dictionary mapping option names (without the leading dash)
            to option values

    Returns:
        List of strings, in the iteration order of kwargs. Options whose
        value is falsy (None, False, '', empty containers) are omitted,
        except for a numeric zero, which is passed through as a value.
    """
    args = []
    for opt, val in kwargs.items():
        # A numeric 0 is a legitimate option value (e.g. x=0), so it must
        # not be dropped together with the "option disabled" values. bool
        # is excluded explicitly because False is an int equal to 0.
        is_zero = (isinstance(val, (int, float))
                   and not isinstance(val, bool)
                   and val == 0)
        if not val and not is_zero:
            continue

        # Get option name
        args.append('-' + opt)
        if val is not True:
            # Include value if option is not a boolean switch
            args.append(str(val))
    return args
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pdfinfo(pdf_file, **kwargs):
    """Call pdfinfo (Poppler) with optional keyword arguments.

    The command is always run with '-isodates'; keyword arguments are
    converted to additional options by parse_cli_args.

    Args:
        pdf_file: path to pdf file
        **kwargs: pdfinfo options without the leading dash, e.g. f=1, l=10

    Returns:
        Dictionary containing pdf info: one entry per 'key: value' line
        printed by pdfinfo on stdout, with surrounding whitespace stripped
        from the value. Output on stderr is ignored, so the dictionary is
        empty when pdfinfo prints nothing to stdout.
    """
    # Parse command line arguments
    args = parse_cli_args(kwargs)

    # Collect command line arguments
    command_str = ['pdfinfo', '-isodates', *args, pdf_file]

    # run() drains stdout and stderr together and waits for the process to
    # exit, so a chatty stderr cannot block the child and no process or
    # pipe is left behind.
    completed = subprocess.run(
        command_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = completed.stdout.decode()

    # Parse output
    result = {}
    for line in stdout.split('\n'):
        # Split on the first colon only: values may contain colons
        # themselves (e.g. the timestamps requested with -isodates).
        key, sep, val = line.partition(':')
        if not sep:
            # Blank line or anything that is not of the form 'key: value'
            continue
        result[key] = val.strip()

    return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pdftotext(pdf_file, **kwargs):
    """Call pdftotext (Poppler) with optional keyword arguments.

    Args:
        pdf_file: path to pdf file
        **kwargs: pdftotext options without the leading dash; they are
            converted to command line arguments by parse_cli_args

    Returns:
        Extracted text as a string. The string is empty when pdftotext
        writes nothing to stdout (e.g. a page without text).

    Raises:
        ValueError: if pdftotext writes anything to stderr; the exception
            message is the stderr output.

    Example usage:
        Extract text from 'document.pdf' from page 1 to page 10.

        pdftotext('document.pdf', f=1, l=10)
    """
    # Parse command line arguments
    args = parse_cli_args(kwargs)

    # The trailing '-' makes pdftotext write the text to stdout
    command_str = ['pdftotext', *args, pdf_file, '-']

    # run() drains stdout and stderr together and waits for the process to
    # exit, so a full stderr pipe cannot block the child and no process or
    # pipe is left behind.
    completed = subprocess.run(
        command_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = completed.stdout.decode()
    stderr = completed.stderr.decode()

    # NOTE(review): any stderr output is treated as a failure, including
    # messages that may only be warnings; kept as-is so callers that catch
    # ValueError see the same behaviour.
    if stderr:
        raise ValueError(stderr)

    # Always return a string so callers can feed the result straight into
    # string/regex functions, even for pages that contain no text.
    return stdout
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_string(pattern, string, match_object, context_length):
|
|
|
|
|
"""Get details of SRE_Match object.
|
|
|
|
|
|
|
|
|
@ -67,42 +142,41 @@ def search_pdf(pdf_name, search_patterns, context_length):
|
|
|
|
|
search_patterns = [search_patterns]
|
|
|
|
|
|
|
|
|
|
search_results = []
|
|
|
|
|
with open(pdf_name, 'rb') as pdf:
|
|
|
|
|
pdf_shortname = os.path.split(pdf_name)[1]
|
|
|
|
|
reader = PyPDF2.PdfFileReader(pdf)
|
|
|
|
|
num_pages = reader.getNumPages()
|
|
|
|
|
|
|
|
|
|
# Set up tqdm progress bar
|
|
|
|
|
pbar = tqdm(range(num_pages))
|
|
|
|
|
for i in pbar:
|
|
|
|
|
pbar.set_description('Searching {}'.format(pdf_shortname))
|
|
|
|
|
page_num = i + 1
|
|
|
|
|
page = reader.getPage(i)
|
|
|
|
|
try:
|
|
|
|
|
page_text = page.extractText().replace('\n', ' ')
|
|
|
|
|
# Skip page if text cannot be read
|
|
|
|
|
except (KeyError, ZLibError):
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
for pattern in search_patterns:
|
|
|
|
|
if pattern == pattern.lower():
|
|
|
|
|
# Ignore case if pattern is all lowercase
|
|
|
|
|
matches = re.finditer(pattern, page_text, re.IGNORECASE)
|
|
|
|
|
else:
|
|
|
|
|
# Respect case if pattern is mixed or uppercase
|
|
|
|
|
matches = re.finditer(pattern, page_text)
|
|
|
|
|
|
|
|
|
|
for match in matches:
|
|
|
|
|
whole_word, context = search_string(
|
|
|
|
|
pattern, page_text, match, context_length)
|
|
|
|
|
|
|
|
|
|
# Update search results
|
|
|
|
|
search_results.append({
|
|
|
|
|
'document': pdf_name,
|
|
|
|
|
'page': page_num,
|
|
|
|
|
'pattern': pattern,
|
|
|
|
|
'word': whole_word,
|
|
|
|
|
'context': context,
|
|
|
|
|
})
|
|
|
|
|
pdf_shortname = os.path.split(pdf_name)[1]
|
|
|
|
|
num_pages = int(pdfinfo(pdf_name)['Pages'])
|
|
|
|
|
|
|
|
|
|
# Set up tqdm progress bar
|
|
|
|
|
pbar = tqdm(range(num_pages))
|
|
|
|
|
for i in pbar:
|
|
|
|
|
pbar.set_description('Searching {}'.format(pdf_shortname))
|
|
|
|
|
page_num = i + 1
|
|
|
|
|
|
|
|
|
|
# Extract raw text on current page with pdftotext
|
|
|
|
|
page_text = pdftotext(pdf_name, f=page_num, l=page_num)
|
|
|
|
|
|
|
|
|
|
# Remove unicode control characters
|
|
|
|
|
page_text = re.sub('[\x02\x03]', ' ', page_text)
|
|
|
|
|
page_text = re.sub('\r', '', page_text)
|
|
|
|
|
|
|
|
|
|
for pattern in search_patterns:
|
|
|
|
|
if pattern == pattern.lower():
|
|
|
|
|
# Ignore case if pattern is all lowercase
|
|
|
|
|
matches = re.finditer(pattern, page_text, re.IGNORECASE)
|
|
|
|
|
else:
|
|
|
|
|
# Respect case if pattern is mixed or uppercase
|
|
|
|
|
matches = re.finditer(pattern, page_text)
|
|
|
|
|
|
|
|
|
|
for match in matches:
|
|
|
|
|
whole_word, context = search_string(
|
|
|
|
|
pattern, page_text, match, context_length)
|
|
|
|
|
|
|
|
|
|
# Update search results
|
|
|
|
|
search_results.append({
|
|
|
|
|
'document': pdf_name,
|
|
|
|
|
'page': page_num,
|
|
|
|
|
'pattern': pattern,
|
|
|
|
|
'word': whole_word,
|
|
|
|
|
'context': context,
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
return search_results
|
|
|
|
|