Remove PyPDF2 dependency (use Poppler:pdftotext instead)

7 years ago · cd7ddad2be
parent c67fe76722
commit cd7ddad2be
1 changed files with 114 additions and 40 deletions
--- a/pdfsearch/pdfsearch.py
+++ b/pdfsearch/pdfsearch.py
@@ -1,11 +1,86 @@
 import os
 import re
-import PyPDF2
-from PyPDF2.utils import PdfReadError
-from zlib import error as ZLibError
+import subprocess
 from tqdm import tqdm


+def parse_cli_args(kwargs):
+    args = []
+    for opt, val in kwargs.items():
+        # Skip unset/falsy options; emit the flag for every other option
+        if val:
+            args.append('-' + opt)
+            if val is not True:
+                # Include value if option is not a boolean switch
+                args.append(str(val))
+    return args
+
+
+def pdfinfo(pdf_file, **kwargs):
+    """Call pdfinfo (Poppler) with optional keyword arguments.
+
+    Args:
+        pdf_file: path to pdf file
+
+    Returns:
+        Dictionary containing pdf info
+    """
+
+    # Parse command line arguments
+    args = parse_cli_args(kwargs)
+
+    # Collect command line arguments
+    command_str = ['pdfinfo', '-isodates', *args, pdf_file]
+
+    result = subprocess.Popen(
+        command_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1)
+    stdout = result.stdout.read().decode()
+    stderr = result.stderr.read().decode()
+
+    # Parse output
+    result = {}
+    for p in stdout.split('\n'):
+        if p:
+            key, val = p.split(':', 1)
+            result[key] = val.strip()
+
+    return result
+
+
+def pdftotext(pdf_file, **kwargs):
+    """Call pdftotext (Poppler) with optional keyword arguments.
+
+    Args:
+        pdf_file: path to pdf file
+
+    Returns:
+        Extracted text
+
+    Example usage:
+        Extract text from 'document.pdf' from page 1 to page 10.
+
+        pdftotext('document.pdf', f=1, l=10)
+    """
+
+    # Parse command line arguments
+    args = parse_cli_args(kwargs)
+
+    command_str = ['pdftotext', *args, pdf_file, '-']
+
+    result = subprocess.Popen(
+        command_str,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        bufsize=1, )
+    stdout = result.stdout.read().decode()
+    stderr = result.stderr.read().decode()
+
+    if stderr:
+        raise ValueError(stderr)
+    elif stdout:
+        return stdout
+
+
 def search_string(pattern, string, match_object, context_length):
    """Get details of SRE_Match object.

@@ -67,42 +142,41 @@ def search_pdf(pdf_name, search_patterns, context_length):
        search_patterns = [search_patterns]

    search_results = []
-    with open(pdf_name, 'rb') as pdf:
-        pdf_shortname = os.path.split(pdf_name)[1]
-        reader = PyPDF2.PdfFileReader(pdf)
-        num_pages = reader.getNumPages()
-
-        # Set up tqdm progress bar
-        pbar = tqdm(range(num_pages))
-        for i in pbar:
-            pbar.set_description('Searching {}'.format(pdf_shortname))
-            page_num = i + 1
-            page = reader.getPage(i)
-            try:
-                page_text = page.extractText().replace('\n', ' ')
-            # Skip page if text cannot be read
-            except (KeyError, ZLibError):
-                continue
-
-            for pattern in search_patterns:
-                if pattern == pattern.lower():
-                    # Ignore case if pattern is all lowercase
-                    matches = re.finditer(pattern, page_text, re.IGNORECASE)
-                else:
-                    # Respect case if pattern is mixed or uppercase
-                    matches = re.finditer(pattern, page_text)
-
-                for match in matches:
-                    whole_word, context = search_string(
-                        pattern, page_text, match, context_length)
-
-                    # Update search results
-                    search_results.append({
-                        'document': pdf_name,
-                        'page': page_num,
-                        'pattern': pattern,
-                        'word': whole_word,
-                        'context': context,
-                    })
+    pdf_shortname = os.path.split(pdf_name)[1]
+    num_pages = int(pdfinfo(pdf_name)['Pages'])
+
+    # Set up tqdm progress bar
+    pbar = tqdm(range(num_pages))
+    for i in pbar:
+        pbar.set_description('Searching {}'.format(pdf_shortname))
+        page_num = i + 1
+
+        # Extract raw text on current page with pdftotext
+        page_text = pdftotext(pdf_name, f=page_num, l=page_num)
+
+        # Remove unicode control characters
+        page_text = re.sub('[\x02\x03]', ' ', page_text)
+        page_text = re.sub('\r', '', page_text)
+
+        for pattern in search_patterns:
+            if pattern == pattern.lower():
+                # Ignore case if pattern is all lowercase
+                matches = re.finditer(pattern, page_text, re.IGNORECASE)
+            else:
+                # Respect case if pattern is mixed or uppercase
+                matches = re.finditer(pattern, page_text)
+
+            for match in matches:
+                whole_word, context = search_string(
+                    pattern, page_text, match, context_length)
+
+                # Update search results
+                search_results.append({
+                    'document': pdf_name,
+                    'page': page_num,
+                    'pattern': pattern,
+                    'word': whole_word,
+                    'context': context,
+                })

    return search_results