pdfunlock/scripts/pdf_unlock.py

"""Remove security features from pdfs.

This script removes security features of pdfs inside a specified folder,
including restrictions on:

- copying text and images
- printing the document

usage: pdf_unlock.py [-h] [-r] [-o] folder

positional arguments:
  folder           name of input folder


optional arguments:
  -h, --help       show this help message and exit
  -r, --recursive  search for files recursively
  -o, --overwrite  overwrite original files

Examples:

Search for pdfs inside 'pdf_folder', then create new unlocked versions
of the pdfs with the suffix '-unlocked.pdf'.
> python pdf_unlock.py pdf_folder

Search for pdfs inside 'pdf_folder', then unlock the pdfs and overwrite
the original versions.
> python pdf_unlock.py pdf_folder -o

Search recursively for pdfs inside 'pdf_folder' and all subfolders,
then unlock the pdfs and overwrite the original versions.
> python pdf_unlock.py pdf_folder -o -r

"""

# Module metadata: author, release version and contact address.
__author__ = "D. Howe"
__version__ = "0.2.0"
__email__ = "d.howe@wrl.unsw.edu.au"

import argparse
import glob
import os
import shutil
import subprocess
import sys

from tqdm import tqdm


def pdfinfo(pdf_name, *args):
    """Get pdf info using the poppler tool 'pdfinfo'.

    Args:
        pdf_name: path to pdf file
        args:     optional extra command line arguments for 'pdfinfo',
                  each passed as a separate positional argument,
                  e.g. pdfinfo(name, '-isodates', '-v')

    Returns:
        A dictionary built from the stdout of 'pdfinfo'. Each line is split
        at its first colon: the text before it becomes the key and the
        stripped text after it becomes the value. Lines that contain no
        colon are ignored.
    """
    command = ['pdfinfo', *args, pdf_name]
    result = subprocess.run(command, stdout=subprocess.PIPE)

    # Metadata fields (e.g. the title) are not guaranteed to be valid UTF-8.
    # Replace undecodable bytes instead of raising, so that one odd field
    # does not make the remaining fields unreadable.
    text = result.stdout.decode('utf-8', errors='replace')

    pdf_info = {}
    for line in text.split('\n'):
        # Split at the first colon only: values may contain colons too.
        key, separator, val = line.partition(':')
        if not separator:
            # Blank lines, bare '\r' from CRLF output, or stray text.
            continue
        pdf_info[key.strip()] = val.strip()

    return pdf_info


def isencrypted(pdf_name):
    """Report whether a pdf is encrypted.

    The answer is taken from the first space-separated word of the
    'Encrypted' field returned by pdfinfo().

    Args:
        pdf_name: path to pdf file

    Returns:
        True if that word is 'yes', False if it is 'no'

    Raises:
        AttributeError if the 'Encrypted' field is missing or holds any
        other value, or if the pdf info could not be parsed
    """
    try:
        encrypted_field = pdfinfo(pdf_name)['Encrypted']
        flag = encrypted_field.split(' ')[0]
        # Anything other than a plain yes/no means the info is unusable.
        if flag not in ('yes', 'no'):
            raise ValueError
        return flag == 'yes'
    except (SyntaxError, KeyError, ValueError):
        raise AttributeError('Could not read pdf info.')


def pdf_unlock(pdf_file, overwrite=False):
    """Rewrite pdf with Ghostscript, removing encryption.

    The unlocked copy is written next to the input file, using the input
    name with the suffix '-unlocked' inserted before the '.pdf' extension.
    Nothing is done when the input name already ends with that suffix,
    when the unlocked copy already exists, or when the pdf is not
    encrypted.

    Args:
        pdf_file:  path to pdf file
        overwrite: if True, move the unlocked copy over the original file

    Returns:
        None

    Raises:
        subprocess.CalledProcessError: if Ghostscript exits with a non-zero
            status. Any partial output is removed and the original file is
            left untouched.
    """
    # Create suffix for unlocked pdfs
    suffix = '-unlocked'

    # Strip file extension
    input_name = os.path.splitext(pdf_file)[0]
    output_file = input_name + suffix + '.pdf'

    # Skip file if it has already been processed
    if input_name.endswith(suffix) or os.path.exists(output_file):
        return

    # Skip file if it is not encrypted
    if not isencrypted(pdf_file):
        return

    # Get name of Ghostscript executable
    exec_name = 'gswin64c' if sys.platform == 'win32' else 'gs'

    # Build the command as an argument list. Without shell=True a single
    # command string is treated on POSIX as the name of the program to run,
    # so the call could never succeed there. A list also removes the need
    # for manual quoting of paths containing spaces or quotes.
    command = [
        exec_name,
        '-dSAFER',
        '-dNOPAUSE',
        '-dQUIET',
        '-dBATCH',
        '-sPDFPassword=',
        '-sDEVICE=pdfwrite',
        # Ghostscript expands '%' in the output name as a page number
        # template; a literal '%' has to be doubled.
        '-sOutputFile=' + output_file.replace('%', '%%'),
        pdf_file,
    ]

    with subprocess.Popen(
            command, stdout=subprocess.PIPE, bufsize=1,
            universal_newlines=True) as proc:
        for line in proc.stdout:
            print(line, end='')

    # Leaving the 'with' block waits for the process, so returncode is set.
    if proc.returncode != 0:
        # Remove partial output: it must neither replace the original nor
        # make this file look "already processed" on the next run.
        if os.path.exists(output_file):
            os.remove(output_file)
        raise subprocess.CalledProcessError(proc.returncode, command)

    # Overwrite original file
    if overwrite:
        shutil.move(output_file, pdf_file)


def main():
    """Command line entry point: unlock every pdf found in a folder.

    Parses the command line, collects the '*.pdf' files inside the given
    folder (and its subfolders when '-r' is given), then calls pdf_unlock()
    on each of them while showing a progress bar. A file that cannot be
    processed is reported and skipped so the remaining files are still
    handled.
    """
    parser = argparse.ArgumentParser(
        description='Remove security features from pdfs.')
    parser.add_argument('folder', help='name of input folder')
    parser.add_argument(
        '-r',
        '--recursive',
        help='search for files recursively',
        action='store_true')
    parser.add_argument(
        '-o',
        '--overwrite',
        help='overwrite original files',
        action='store_true')
    args = parser.parse_args()

    # Fail loudly on a mistyped folder instead of silently doing nothing.
    if not os.path.isdir(args.folder):
        parser.error("folder not found: '{}'".format(args.folder))

    # Get pdf files. The folder name is escaped so that characters such as
    # '[' or '*' in it are matched literally rather than as glob patterns.
    folder = glob.escape(args.folder)
    if args.recursive:
        glob_str = os.path.join(folder, '**', '*.pdf')
    else:
        glob_str = os.path.join(folder, '*.pdf')

    pdf_files = glob.glob(glob_str, recursive=args.recursive)

    pbar = tqdm(pdf_files)
    for pdf_file in pbar:
        pdf_file_shortname = os.path.basename(pdf_file)
        pbar.set_description('Processing {}'.format(pdf_file_shortname))
        try:
            pdf_unlock(pdf_file, overwrite=args.overwrite)
        except (AttributeError, subprocess.CalledProcessError) as err:
            # AttributeError is what isencrypted() raises for a pdf whose
            # info cannot be read; CalledProcessError covers an external
            # tool reporting failure. One bad file should not abort the
            # batch. tqdm.write prints without breaking the progress bar.
            tqdm.write('Skipping {}: {}'.format(pdf_file, err))


# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Add argument parser and documentation 7 years ago			`"""Remove security features from pdfs.`

			`This script removes security features of pdfs inside a specified folder,`
			`including restrictions on:`

			`- copying text and images`
			`- printing the document`

			`usage: pdf_unlock.py [-h] [-r] [-o] folder`

			`positional arguments:`
			`folder name of input folder`


			`optional arguments:`
			`-h, --help show this help message and exit`
			`-r, --recursive search for files recursively`
			`-o, --overwrite overwrite original files`

			`Examples:`

			`Search for pdfs inside 'pdf_folder', then create new unlocked versions`
			`of the pdfs with the suffix '-unlocked.pdf'.`
			`> python pdf_unlock.py pdf_folder`

			`Search for pdfs inside 'pdf_folder', then unlock the pdfs and overwrite`
			`the original versions.`
			`> python pdf_unlock.py pdf_folder -o`

			`Search recursively for pdfs inside 'pdf_folder' and all subfolders,`
			`then unlock the pdfs and overwrite the original versions.`
			`> python pdf_unlock.py pdf_folder -o -r`

			`"""`

Initial commit 7 years ago			`__author__ = "D. Howe"`
Use 'poppler' to check if pdfs are encrypted 7 years ago			`__version__ = "0.2.0"`
Initial commit 7 years ago			`__email__ = "d.howe@wrl.unsw.edu.au"`

			`import os`
			`import glob`
Add progress bar 7 years ago			`import shutil`
Initial commit 7 years ago			`import argparse`
			`import subprocess`
Add progress bar 7 years ago			`from tqdm import tqdm`
Initial commit 7 years ago
Use 'poppler' to check if pdfs are encrypted 7 years ago
			`def pdfinfo(pdf_name, *args):`
			`"""Get pdf info using the poppler tool 'pdfinfo'.`

			`Args:`
			`pdf_name: path to pdf file`
			`args: list of optional arguments, e.g. ['-isodates', '-v']`

			`Returns:`
			`A dictionary containing the lines of stdout from calling 'pdfinfo'`
			`"""`
			`command_str = ['pdfinfo', *args, pdf_name]`
			`result = subprocess.run(command_str, stdout=subprocess.PIPE)`
			`stdout = result.stdout.decode('utf-8').split('\n')`

			`pdf_info = {}`
			`for line in stdout:`
			`if line:`
			`key, val = line.split(':', 1)`
			`pdf_info[key] = val.strip()`

			`return pdf_info`


			`def isencrypted(pdf_name):`
			`"""Check if a pdf is encrypted.`

			`Args:`
			`pdf_name: path to pdf file`

			`Returns:`
			`True if encrypted, otherwise False`
Add error handling for invalid pdfs 7 years ago
			`Raises:`
			`AttributeError if pdf info cannot be obtained`
Use 'poppler' to check if pdfs are encrypted 7 years ago			`"""`
Add error handling for invalid pdfs 7 years ago			`try:`
			`pdf_info = pdfinfo(pdf_name)`
			`status = pdf_info['Encrypted'].split(' ')[0]`
			`if status == 'yes':`
			`return True`
			`elif status == 'no':`
			`return False`
			`else:`
			`raise ValueError`
			`except (SyntaxError, KeyError, ValueError):`
			`raise AttributeError('Could not read pdf info.')`
Use 'poppler' to check if pdfs are encrypted 7 years ago

Add argument parser and documentation 7 years ago			`def pdf_unlock(pdf_file, overwrite=False):`
Use 'poppler' to check if pdfs are encrypted 7 years ago			`"""Rewrite pdf with Ghostscript, removing encryption.`

			`Args:`
			`pdf_file: path to pdf file`
			`overwrite: boolean flag`

			`Returns:`
			`A dictionary containing the lines of stdout from calling 'pdfinfo'`
			`"""`
Add argument parser and documentation 7 years ago			`# Create suffix for unlocked pdfs`
			`suffix = '-unlocked'`
Initial commit 7 years ago
Add argument parser and documentation 7 years ago			`# Strip file extension`
			`input_name = os.path.splitext(pdf_file)[0]`
			`output_name = input_name + suffix`
Initial commit 7 years ago
Add argument parser and documentation 7 years ago			`# Skip file if it has already been processed`
Use 'poppler' to check if pdfs are encrypted 7 years ago			`if input_name.endswith(suffix) or os.path.exists(output_name + '.pdf'):`
			`return`

			`# Skip file if it is not encrypted`
			`if not isencrypted(pdf_file):`
Add argument parser and documentation 7 years ago			`return`
Initial commit 7 years ago
Get platform-specific Ghostscript executable name 7 years ago			`# Get name of Ghostscript executable`
			`if os.sys.platform == 'win32':`
			`exec_name = 'gswin64c'`
			`else:`
			`exec_name = 'gs'`

			`# Create Ghostscript command string`
			`cmd_str = ('{} '`
Add argument parser and documentation 7 years ago			`'-dSAFER '`
			`'-dNOPAUSE '`
			`'-dQUIET '`
			`'-dBATCH '`
			`'-sPDFPassword= '`
			`'-sDEVICE=pdfwrite '`
			`'-sOutputFile="{}.pdf" '`
Get platform-specific Ghostscript executable name 7 years ago			`'"{}.pdf"').format(exec_name, output_name, input_name)`
Initial commit 7 years ago
Add argument parser and documentation 7 years ago			`with subprocess.Popen(`
			`cmd_str, stdout=subprocess.PIPE, bufsize=1,`
			`universal_newlines=True) as p:`
			`for line in p.stdout:`
			`print(line, end='')`
Initial commit 7 years ago
Add argument parser and documentation 7 years ago			`# Overwrite original file`
			`if overwrite:`
			`shutil.move(output_name + '.pdf', input_name + '.pdf')`
Initial commit 7 years ago

			`def main():`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument('folder', help='name of input folder', default=None)`
			`parser.add_argument(`
			`'-r',`
			`'--recursive',`
			`help='search for files recursively',`
			`action='store_true')`
Add argument parser and documentation 7 years ago			`parser.add_argument(`
			`'-o',`
			`'--overwrite',`
			`help='overwrite original files',`
			`action='store_true')`
Initial commit 7 years ago			`args = parser.parse_args()`

Add argument parser and documentation 7 years ago			`# Get pdf files`
			`if args.recursive:`
			`glob_str = args.folder + '/**/*.pdf'`
			`else:`
			`glob_str = args.folder + '/*.pdf'`

			`pdf_files = glob.glob(glob_str, recursive=args.recursive)`

Add progress bar 7 years ago			`pbar = tqdm(pdf_files)`
			`for pdf_file in pbar:`
			`pdf_file_shortname = os.path.split(pdf_file)[1]`
			`pbar.set_description('Processing {}'.format(pdf_file_shortname))`
Add argument parser and documentation 7 years ago			`pdf_unlock(pdf_file, overwrite=args.overwrite)`
Initial commit 7 years ago

			`if __name__ == '__main__':`
			`main()`