Improve handling of unicode control characters.

8 years ago · ff6c503684
parent ec7ebdf1b6
commit ff6c503684
1 changed files with 25 additions and 6 deletions
--- a/pdfsearch/pdfsearch.py
+++ b/pdfsearch/pdfsearch.py
@@ -4,7 +4,26 @@ import subprocess
 from tqdm import tqdm


-def parse_cli_args(kwargs):
+def parse_cli_args(**kwargs):
+    """Prepare arguments for command line.
+
+    Args:
+        kwargs: dictionary containing option names and values
+
+    Returns:
+        List of arguments to pass to subprocess
+
+    Example usage:
+      > parse_cli_args(o='output.txt')
+      ['-o', 'output.txt']
+
+      > parse_cli_args(f=1, l=2)
+      ['-f', '1', '-l', '2']
+
+      > parse_cli_args(h=True)
+      ['-h']
+    """
+
    args = []
    for opt, val in kwargs.items():
        # Get option name
@@ -27,7 +46,7 @@ def pdfinfo(pdf_file, **kwargs):
    """

    # Parse command line arguments
-    args = parse_cli_args(kwargs)
+    args = parse_cli_args(**kwargs)

    # Collect command line arguments
    command_str = ['pdfinfo', '-isodates', *args, pdf_file]
@@ -63,7 +82,7 @@ def pdftotext(pdf_file, **kwargs):
    """

    # Parse command line arguments
-    args = parse_cli_args(kwargs)
+    args = parse_cli_args(**kwargs)

    command_str = ['pdftotext', *args, pdf_file, '-']

@@ -71,7 +90,7 @@ def pdftotext(pdf_file, **kwargs):
        command_str,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
-        bufsize=1, )
+        bufsize=1)
    stdout = result.stdout.read().decode()
    stderr = result.stderr.read().decode()

@@ -155,8 +174,8 @@ def search_pdf(pdf_name, search_patterns, context_length):
        page_text = pdftotext(pdf_name, f=page_num, l=page_num)

        # Remove unicode control characters
-        page_text = re.sub('[\x02\x03]', ' ', page_text)
-        page_text = re.sub('\r', '', page_text)
+        page_text = re.sub('\r\n', '\n', page_text)
+        page_text = re.sub('[\x02\x03\n]', ' ', page_text)

        for pattern in search_patterns:
            if pattern == pattern.lower():