Remove PyPDF2 dependency (use Poppler:pdftotext instead)

master
Dan Howe 7 years ago
parent c67fe76722
commit cd7ddad2be

@ -1,11 +1,86 @@
import os
import re
import PyPDF2
from PyPDF2.utils import PdfReadError
from zlib import error as ZLibError
import subprocess
from tqdm import tqdm
def parse_cli_args(kwargs):
    """Translate keyword arguments into a flat list of command line tokens.

    Every key becomes a single-dash option ('-key'). Options with a falsy
    value are left out, a value of True produces the bare switch, and any
    other value is appended after the option as a string.

    Args:
        kwargs: dictionary mapping option names to their values

    Returns:
        List of command line argument strings
    """
    cli_tokens = []
    for name, value in kwargs.items():
        # A falsy value means the option was not requested
        if not value:
            continue

        cli_tokens.append('-' + name)

        # Boolean switches stand alone; anything else carries its value
        if value is not True:
            cli_tokens.append(str(value))

    return cli_tokens
def pdfinfo(pdf_file, **kwargs):
    """Call pdfinfo (Poppler) with optional keyword arguments.

    The '-isodates' option is always passed. Keyword arguments are converted
    to command line options by parse_cli_args().

    Args:
        pdf_file: path to pdf file
        **kwargs: additional pdfinfo options, e.g. f=1, l=10

    Returns:
        Dictionary containing pdf info, keyed by the text before the first
        colon on each output line. The dictionary is empty if pdfinfo
        printed nothing to stdout.
    """
    # Parse command line arguments
    args = parse_cli_args(kwargs)

    # Collect command line arguments
    command_str = ['pdfinfo', '-isodates', *args, pdf_file]

    # subprocess.run() drains both pipes concurrently and waits for the
    # child to exit. Reading stdout to EOF before touching stderr can
    # deadlock once the child fills the stderr pipe, and never waiting on
    # the process leaks its handles.
    completed = subprocess.run(
        command_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = completed.stdout.decode()

    # Parse output
    info = {}
    for line in stdout.split('\n'):
        key, sep, val = line.partition(':')
        # Ignore blank lines and lines that are not 'key: value' pairs
        if sep:
            info[key] = val.strip()

    return info
def pdftotext(pdf_file, **kwargs):
    """Call pdftotext (Poppler) with optional keyword arguments.

    The text is written to stdout ('-' is passed as the output file) and
    returned as a string. Keyword arguments are converted to command line
    options by parse_cli_args().

    Args:
        pdf_file: path to pdf file
        **kwargs: additional pdftotext options, e.g. f=1, l=10

    Returns:
        Extracted text (an empty string if pdftotext produced no output)

    Raises:
        ValueError: if pdftotext exits with a non-zero status. The message
            is the text pdftotext wrote to stderr.

    Example usage:
        Extract text from 'document.pdf' from page 1 to page 10.

        pdftotext('document.pdf', f=1, l=10)
    """
    # Parse command line arguments
    args = parse_cli_args(kwargs)

    command_str = ['pdftotext', *args, pdf_file, '-']

    # subprocess.run() drains both pipes concurrently and waits for the
    # child to exit, avoiding the deadlock that sequential reads of stdout
    # and stderr can cause.
    completed = subprocess.run(
        command_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Judge success by the exit status: Poppler can print warnings to
    # stderr while still extracting the text successfully.
    if completed.returncode != 0:
        stderr = completed.stderr.decode(errors='replace')
        raise ValueError(
            stderr or 'pdftotext exited with status {}'.format(
                completed.returncode))

    # Always return a string so callers can process blank pages safely
    return completed.stdout.decode()
def search_string(pattern, string, match_object, context_length):
"""Get details of SRE_Match object.
@ -67,42 +142,41 @@ def search_pdf(pdf_name, search_patterns, context_length):
search_patterns = [search_patterns]
search_results = []
with open(pdf_name, 'rb') as pdf:
pdf_shortname = os.path.split(pdf_name)[1]
reader = PyPDF2.PdfFileReader(pdf)
num_pages = reader.getNumPages()
# Set up tqdm progress bar
pbar = tqdm(range(num_pages))
for i in pbar:
pbar.set_description('Searching {}'.format(pdf_shortname))
page_num = i + 1
page = reader.getPage(i)
try:
page_text = page.extractText().replace('\n', ' ')
# Skip page if text cannot be read
except (KeyError, ZLibError):
continue
for pattern in search_patterns:
if pattern == pattern.lower():
# Ignore case if pattern is all lowercase
matches = re.finditer(pattern, page_text, re.IGNORECASE)
else:
# Respect case if pattern is mixed or uppercase
matches = re.finditer(pattern, page_text)
for match in matches:
whole_word, context = search_string(
pattern, page_text, match, context_length)
# Update search results
search_results.append({
'document': pdf_name,
'page': page_num,
'pattern': pattern,
'word': whole_word,
'context': context,
})
pdf_shortname = os.path.split(pdf_name)[1]
num_pages = int(pdfinfo(pdf_name)['Pages'])
# Set up tqdm progress bar
pbar = tqdm(range(num_pages))
for i in pbar:
pbar.set_description('Searching {}'.format(pdf_shortname))
page_num = i + 1
# Extract raw text on current page with pdftotext
page_text = pdftotext(pdf_name, f=page_num, l=page_num)
# Remove unicode control characters
page_text = re.sub('[\x02\x03]', ' ', page_text)
page_text = re.sub('\r', '', page_text)
for pattern in search_patterns:
if pattern == pattern.lower():
# Ignore case if pattern is all lowercase
matches = re.finditer(pattern, page_text, re.IGNORECASE)
else:
# Respect case if pattern is mixed or uppercase
matches = re.finditer(pattern, page_text)
for match in matches:
whole_word, context = search_string(
pattern, page_text, match, context_length)
# Update search results
search_results.append({
'document': pdf_name,
'page': page_num,
'pattern': pattern,
'word': whole_word,
'context': context,
})
return search_results

Loading…
Cancel
Save