Remove PyPDF2 dependency (use Poppler:pdftotext instead)

8 years ago · cd7ddad2be
parent c67fe76722
commit cd7ddad2be
1 changed files with 114 additions and 40 deletions
--- a/pdfsearch/pdfsearch.py
+++ b/pdfsearch/pdfsearch.py
@ -1,11 +1,86 @@
 import os
 import re
-import PyPDF2
+import subprocess
 from PyPDF2.utils import PdfReadError
 from zlib import error as ZLibError
 from tqdm import tqdm
 def parse_cli_args(kwargs):
    """Convert keyword options into a flat list of command line arguments.

    Every key is turned into a single-dash option ('-' + key).

    Args:
        kwargs: dictionary mapping option names to values
            - True: the option is a boolean switch and is emitted alone
            - None, False, '' (any non-numeric falsy value): option is omitted
            - anything else (including a numeric 0): the option is followed
              by str(value)

    Returns:
        List of argument strings, in the iteration order of kwargs

    Example usage:
        parse_cli_args({'f': 1, 'layout': True}) -> ['-f', '1', '-layout']
    """
    args = []
    for opt, val in kwargs.items():
        # bool is a subclass of int, so exclude it explicitly
        is_number = (isinstance(val, (int, float))
                     and not isinstance(val, bool))
        # Falsy values switch the option off, except a numeric zero which is
        # a legitimate value and must not be dropped silently
        if not val and not is_number:
            continue
        args.append('-' + opt)
        if val is not True:
            # Include value if option is not a boolean switch
            args.append(str(val))
    return args
 def pdfinfo(pdf_file, **kwargs):
    """Call pdfinfo (Poppler) with optional keyword arguments.

    Args:
        pdf_file: path to pdf file
        **kwargs: options forwarded to pdfinfo through parse_cli_args,
            e.g. f=1, l=3 is passed as '-f 1 -l 3'

    Returns:
        Dictionary containing pdf info. Each 'key: value' line printed on
        stdout becomes one entry (value stripped of surrounding whitespace).
        Lines without a ':' are ignored. The dictionary is empty when pdfinfo
        prints nothing on stdout.
    """
    # Collect command line arguments
    command = ['pdfinfo', '-isodates', *parse_cli_args(kwargs), pdf_file]

    # run() drains stdout and stderr together and waits for the process to
    # exit, so a chatty stderr cannot block the child and no zombie is left.
    # stderr is captured only to keep it off the terminal; as before, it is
    # not inspected here.
    completed = subprocess.run(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Best effort decoding: undecodable metadata bytes must not abort parsing
    stdout = completed.stdout.decode(errors='replace')

    # Parse output
    info = {}
    for line in stdout.split('\n'):
        # Split on the first ':' only, values (e.g. dates) may contain more
        key, sep, val = line.partition(':')
        if sep:
            info[key] = val.strip()
    return info
 def pdftotext(pdf_file, **kwargs):
    """Call pdftotext (Poppler) with optional keyword arguments.

    Args:
        pdf_file: path to pdf file
        **kwargs: options forwarded to pdftotext through parse_cli_args,
            e.g. f=1, l=10 is passed as '-f 1 -l 10'

    Returns:
        Extracted text (an empty string if pdftotext produced no output)

    Raises:
        ValueError: if pdftotext wrote anything to stderr; the message is
            the decoded stderr output

    Example usage:
        Extract text from 'document.pdf' from page 1 to page 10.

        pdftotext('document.pdf', f=1, l=10)
    """
    # The trailing '-' is given in place of an output file name; the text is
    # read back from the process' stdout
    command = ['pdftotext', *parse_cli_args(kwargs), pdf_file, '-']

    # run() drains stdout and stderr together and waits for the process to
    # exit, so a full stderr pipe cannot block the child and no zombie is left
    completed = subprocess.run(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    stdout = completed.stdout.decode()
    stderr = completed.stderr.decode()
    if stderr:
        raise ValueError(stderr)
    # Always return a string so callers can pass the result on to re.sub etc.
    return stdout
 def search_string(pattern, string, match_object, context_length):
    """Get details of SRE_Match object.
@ -67,22 +142,21 @@ def search_pdf(pdf_name, search_patterns, context_length):
        search_patterns = [search_patterns]
    search_results = []
    with open(pdf_name, 'rb') as pdf:
    pdf_shortname = os.path.split(pdf_name)[1]
-        reader = PyPDF2.PdfFileReader(pdf)
+    num_pages = int(pdfinfo(pdf_name)['Pages'])
        num_pages = reader.getNumPages()
    # Set up tqdm progress bar
    pbar = tqdm(range(num_pages))
    for i in pbar:
        pbar.set_description('Searching {}'.format(pdf_shortname))
        page_num = i + 1
-            page = reader.getPage(i)
+
-            try:
+        # Extract raw text on current page with pdftotext
-                page_text = page.extractText().replace('\n', ' ')
+        page_text = pdftotext(pdf_name, f=page_num, l=page_num)
-            # Skip page if text cannot be read
+
-            except (KeyError, ZLibError):
+        # Remove unicode control characters
-                continue
+        page_text = re.sub('[\x02\x03]', ' ', page_text)
        page_text = re.sub('\r', '', page_text)
        for pattern in search_patterns:
            if pattern == pattern.lower():