"""Remove security features from pdfs. This script removes security features of pdfs inside a specified folder, including restrictions on: - copying text and images - printing the document usage: pdf_unlock.py [-h] [-r] [-o] folder positional arguments: folder name of input folder optional arguments: -h, --help show this help message and exit -r, --recursive search for files recursively -o, --overwrite overwrite original files Examples: Search for pdfs inside 'pdf_folder', then create new unlocked versions of the pdfs with the the suffix '-unlocked.pdf'. > python pdf_unlock.py pdf_folder Search for pdfs inside 'pdf_folder', then unlock the pdfs and overwrite the original versions. > python pdf_unlock.py pdf_folder -o Search recursively for pdfs inside 'pdf_folder' and all subfolders, then unlock the pdfs and overwrite the original versions. > python pdf_unlock.py pdf_folder -o -r """ __author__ = "D. Howe" __version__ = "0.2.0" __email__ = "d.howe@wrl.unsw.edu.au" import os import glob import shutil import argparse import subprocess from tqdm import tqdm def pdfinfo(pdf_name, *args): """Get pdf info using the poppler tool 'pdfinfo'. Args: pdf_name: path to pdf file args: list of optional arguments, e.g. ['-isodates', '-v'] Returns: A dictionary containing the lines of stdout from calling 'pdfinfo' """ command_str = ['pdfinfo', *args, pdf_name] result = subprocess.run( command_str, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) stdout = result.stdout.decode('utf-8').split('\n') pdf_info = {} for line in stdout: if line: key, val = line.split(':', 1) pdf_info[key] = val.strip() return pdf_info def isencrypted(pdf_name): """Check if a pdf is encrypted. Args: pdf_name: path to pdf file Returns: True if encrypted, otherwise False Raises: AttributeError if pdf info cannot be obtained """ try: pdf_info = pdfinfo(pdf_name) status = pdf_info['Encrypted'].split(' ')[0] if status == 'yes': return True elif status == 'no': return False else: raise ValueError except (SyntaxError, KeyError, ValueError): raise AttributeError('Could not read pdf info.') def pdf_unlock(pdf_file, overwrite=False): """Rewrite pdf with Ghostscript, removing encryption. Args: pdf_file: path to pdf file overwrite: boolean flag Returns: A dictionary containing the lines of stdout from calling 'pdfinfo' """ # Create suffix for unlocked pdfs suffix = '-unlocked' # Strip file extension input_name = os.path.splitext(pdf_file)[0] output_name = input_name + suffix # Skip file if it has already been processed if input_name.endswith(suffix) or os.path.exists(output_name + '.pdf'): return # Skip file if it is not encrypted if not isencrypted(pdf_file): return # Get name of Ghostscript executable if os.sys.platform == 'win32': exec_name = 'gswin64c' else: exec_name = 'gs' # Create Ghostscript command string cmd_str = ('{} ' '-dSAFER ' '-dNOPAUSE ' '-dQUIET ' '-dBATCH ' '-sPDFPassword= ' '-sDEVICE=pdfwrite ' '-sOutputFile="{}.pdf" ' '"{}.pdf"').format(exec_name, output_name, input_name) with subprocess.Popen( cmd_str, stdout=subprocess.PIPE, bufsize=1, universal_newlines=True) as p: for line in p.stdout: print(line, end='') # Overwrite original file if overwrite: shutil.move(output_name + '.pdf', input_name + '.pdf') def main(): parser = argparse.ArgumentParser() parser.add_argument('folder', help='name of input folder', default=None) parser.add_argument( '-r', '--recursive', help='search for files recursively', action='store_true') parser.add_argument( '-o', '--overwrite', help='overwrite original files', action='store_true') args = parser.parse_args() # Get pdf files if args.recursive: glob_str = args.folder + '/**/*.pdf' else: glob_str = args.folder + '/*.pdf' pdf_files = glob.glob(glob_str, recursive=args.recursive) pbar = tqdm(pdf_files) for pdf_file in pbar: pdf_file_shortname = os.path.split(pdf_file)[1] pbar.set_description('Processing {}'.format(pdf_file_shortname)) pdf_unlock(pdf_file, overwrite=args.overwrite) if __name__ == '__main__': main()