Remove PyPDF2 dependency (use Poppler:pdftotext instead)

master
Dan Howe 7 years ago
parent c67fe76722
commit cd7ddad2be

@ -1,11 +1,86 @@
import os import os
import re import re
import PyPDF2 import subprocess
from PyPDF2.utils import PdfReadError
from zlib import error as ZLibError
from tqdm import tqdm from tqdm import tqdm
def parse_cli_args(kwargs):
    """Convert keyword arguments into a list of command line arguments.

    Each key becomes a single-dash option (``f`` -> ``-f``). A value of
    ``True`` marks a boolean switch, so only the option name is emitted.
    Any other value is emitted after the option name, converted with
    ``str()``.

    Args:
        kwargs: dictionary mapping option names (without the leading dash)
            to option values

    Returns:
        List of strings suitable for appending to a subprocess command
    """
    args = []
    for opt, val in kwargs.items():
        # Options that are switched off (None / False) or left blank are
        # omitted entirely. Numeric zero is a legitimate value (e.g. a crop
        # offset of 0) and must NOT be dropped, so truthiness is not used.
        if val is None or val is False or val == '':
            continue

        # Get option name
        args.append('-' + opt)

        if val is not True:
            # Include value if option is not a boolean switch
            args.append(str(val))
    return args
def pdfinfo(pdf_file, **kwargs):
    """Call pdfinfo (Poppler) with optional keyword arguments.

    The ``-isodates`` switch is always passed to pdfinfo.

    Args:
        pdf_file: path to pdf file
        **kwargs: pdfinfo command line options without the leading dash,
            converted with ``parse_cli_args``

    Returns:
        Dictionary containing pdf info. Keys are the field names printed by
        pdfinfo (the text before the first ':' on each line) and values are
        the remaining text with surrounding whitespace stripped. The
        dictionary is empty if pdfinfo printed nothing on stdout.
    """
    # Parse command line arguments
    args = parse_cli_args(kwargs)

    # Collect command line arguments
    command_str = ['pdfinfo', '-isodates', *args, pdf_file]

    # subprocess.run drains stdout and stderr concurrently and waits for the
    # process to exit; reading the two pipes one after the other can deadlock
    # when the unread pipe fills up, and leaves the child process unreaped.
    completed = subprocess.run(
        command_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = completed.stdout.decode()

    # Parse output
    info = {}
    for line in stdout.split('\n'):
        key, sep, val = line.partition(':')
        if not sep:
            # Blank line, or a line that is not a 'key: value' pair
            continue
        info[key] = val.strip()

    return info
def pdftotext(pdf_file, **kwargs):
    """Call pdftotext (Poppler) with optional keyword arguments.

    The text is written by pdftotext to stdout (output file '-') and
    captured, so no file is created.

    Args:
        pdf_file: path to pdf file
        **kwargs: pdftotext command line options without the leading dash,
            converted with ``parse_cli_args``

    Returns:
        Extracted text (an empty string if no text was extracted)

    Raises:
        ValueError: if pdftotext writes anything to stderr

    Example usage:
        Extract text from 'document.pdf' from page 1 to page 10.

        pdftotext('document.pdf', f=1, l=10)
    """
    # Parse command line arguments
    args = parse_cli_args(kwargs)

    # '-' as the output file makes pdftotext print the text to stdout
    command_str = ['pdftotext', *args, pdf_file, '-']

    # subprocess.run drains stdout and stderr concurrently and waits for the
    # process to exit; reading the two pipes one after the other can deadlock
    # when the unread pipe fills up, and leaves the child process unreaped.
    completed = subprocess.run(
        command_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = completed.stdout.decode()
    stderr = completed.stderr.decode()

    if stderr:
        raise ValueError(stderr)

    # Always return a string, so that callers can apply string operations
    # (e.g. re.sub) to the result even when a page contains no text.
    return stdout
def search_string(pattern, string, match_object, context_length): def search_string(pattern, string, match_object, context_length):
"""Get details of SRE_Match object. """Get details of SRE_Match object.
@ -67,22 +142,21 @@ def search_pdf(pdf_name, search_patterns, context_length):
search_patterns = [search_patterns] search_patterns = [search_patterns]
search_results = [] search_results = []
with open(pdf_name, 'rb') as pdf:
pdf_shortname = os.path.split(pdf_name)[1] pdf_shortname = os.path.split(pdf_name)[1]
reader = PyPDF2.PdfFileReader(pdf) num_pages = int(pdfinfo(pdf_name)['Pages'])
num_pages = reader.getNumPages()
# Set up tqdm progress bar # Set up tqdm progress bar
pbar = tqdm(range(num_pages)) pbar = tqdm(range(num_pages))
for i in pbar: for i in pbar:
pbar.set_description('Searching {}'.format(pdf_shortname)) pbar.set_description('Searching {}'.format(pdf_shortname))
page_num = i + 1 page_num = i + 1
page = reader.getPage(i)
try: # Extract raw text on current page with pdftotext
page_text = page.extractText().replace('\n', ' ') page_text = pdftotext(pdf_name, f=page_num, l=page_num)
# Skip page if text cannot be read
except (KeyError, ZLibError): # Remove unicode control characters
continue page_text = re.sub('[\x02\x03]', ' ', page_text)
page_text = re.sub('\r', '', page_text)
for pattern in search_patterns: for pattern in search_patterns:
if pattern == pattern.lower(): if pattern == pattern.lower():

Loading…
Cancel
Save