From 835790a54254e1e0a954b844a5fe79fcd577d8a7 Mon Sep 17 00:00:00 2001 From: Dan Howe Date: Thu, 12 Apr 2018 09:12:26 +1000 Subject: [PATCH] Use 'poppler' to check if pdfs are encrypted --- scripts/pdf_unlock.py | 60 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/scripts/pdf_unlock.py b/scripts/pdf_unlock.py index b89f898..dfd780a 100644 --- a/scripts/pdf_unlock.py +++ b/scripts/pdf_unlock.py @@ -34,7 +34,7 @@ then unlock the pdfs and overwrite the original versions. """ __author__ = "D. Howe" -__version__ = "0.1.0" +__version__ = "0.2.0" __email__ = "d.howe@wrl.unsw.edu.au" import os @@ -44,7 +44,58 @@ import argparse import subprocess from tqdm import tqdm + +def pdfinfo(pdf_name, *args): + """Get pdf info using the poppler tool 'pdfinfo'. + + Args: + pdf_name: path to pdf file + args: optional 'pdfinfo' arguments, each passed separately, e.g. '-isodates', '-v' + + Returns: + A dictionary mapping each field name in the 'pdfinfo' output to its value + """ + command_str = ['pdfinfo', *args, pdf_name] + result = subprocess.run(command_str, stdout=subprocess.PIPE) + stdout = result.stdout.decode('utf-8').split('\n') + + pdf_info = {} + for line in stdout: + if line: + key, val = line.split(':', 1) + pdf_info[key] = val.strip() + + return pdf_info + + +def isencrypted(pdf_name): + """Check if a pdf is encrypted. + + Args: + pdf_name: path to pdf file + + Returns: + True if encrypted, otherwise False + """ + pdf_info = pdfinfo(pdf_name) + status = pdf_info['Encrypted'].split(' ')[0] + + if status == 'yes': + return True + else: + return False + + def pdf_unlock(pdf_file, overwrite=False): + """Rewrite pdf with Ghostscript, removing encryption. 
+ + Args: + pdf_file: path to pdf file + overwrite: boolean flag + + Returns: + None when the file is skipped (already processed or not encrypted) + """ # Create suffix for unlocked pdfs suffix = '-unlocked' @@ -53,7 +104,12 @@ def pdf_unlock(pdf_file, overwrite=False): output_name = input_name + suffix # Skip file if it has already been processed - if input_name.endswith(suffix): + if input_name.endswith(suffix) or os.path.exists(output_name + '.pdf'): + return + + # Skip file if it is not encrypted + if not isencrypted(pdf_file): + print('file skipped') return # Get name of Ghostscript executable