major-projects-grabber/major_projects_grabber/major_projects_grabber.py

"""major_projects_grabber.py
Download documents from the NSW DPE Major Projects website.

Example usage:
    # Grab a single project modification using its job id, and save in 'files'
    major_projects_grabber -i 1746 -o files

    # Grab all modifications in search results page, and save in current folder
    major_projects_grabber -u "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"

"""

import os
import re
import sys
import shutil
import logging
import requests
import argparse
import warnings
from lxml import html
from tqdm import tqdm
from requests.exceptions import ConnectionError, InvalidURL


def make_safe(s):
    """Remove characters that would be invalid in a filepath.

    Backslash, asterisk, double quote, less-than, greater-than, pipe and
    question mark are removed, a colon is replaced with ' -', a forward
    slash is replaced with '-', and leading/trailing whitespace is stripped.

    Args:
        s: Text to be used as a single file or folder name.

    Returns:
        The sanitised text.
    """
    # Remove '\', '*', '"', '<', '>', '|' and '?'.
    # A character class is used so that every character is matched on its
    # own (an alternation such as '<|>\|' only matches the sequence '>|').
    s_safe = re.sub(r'[\\*"<>|?]', '', s)

    # Replace '/' and ':' with '-'
    s_safe = s_safe.replace(':', ' -')
    s_safe = s_safe.replace('/', '-')

    # Strip last, so whitespace exposed by the removals above is dropped too
    return s_safe.strip()


def mod_ids_from_search(search_results_url):
    """Get modification job IDs from search results URL.

    Args:
        search_results_url: URL of a Major Projects search results page.

    Returns:
        List of job IDs (as strings), in the order the links appear in the
        results table. Links without a 'job_id=<digits>' part are skipped.
    """

    # Get HTML of search results page
    page = requests.get(search_results_url)

    # Create HTML tree
    tree = html.fromstring(page.content)

    # Find job ids of items in results list (first column of each row)
    mod_ids = []
    for link in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        # 'href' may be missing, and not every link has to be a job link;
        # skip those instead of failing on a None match
        match = re.search(r'(?<=job_id=)\d+', link.get('href') or '')
        if match:
            mod_ids.append(match.group())

    return mod_ids


def get_document_list(mod_id, output_dir):
    """Get list of documents from project modification ID.

    Besides building the list, this creates the modification folder
    '<output_dir>/<project>/<modification>', writes a 'DPE-portal-page.html'
    redirect to the portal page into it, and adds an empty marker file when
    the portal lists no document folders.

    Args:
        mod_id: Modification job ID (string) used in the portal URL.
        output_dir: Root output directory.

    Returns:
        List of dicts with the keys 'url' (link target), 'name' (link text
        as shown on the portal) and 'document_path' (destination path, with
        every component below output_dir sanitised by make_safe).
    """

    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id=' + mod_id)
    mod_page = requests.get(mod_url)
    mod_tree = html.fromstring(mod_page.content)

    # Get mod details
    project_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text
    mod_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text

    # Get list of document folders
    folders = mod_tree.xpath('//div[@class="folder_row"]')

    # Remove invalid characters before creating folders
    project_name = make_safe(project_name)
    mod_name = make_safe(mod_name)

    # Create modification folder
    mod_dir = os.path.join(output_dir, project_name, mod_name)
    try:
        os.makedirs(mod_dir, exist_ok=True)
    except FileNotFoundError:
        # Fix destination path if longer than 255 characters (Windows only)
        mod_dir = '\\\\?\\' + os.path.abspath(mod_dir)
        os.makedirs(mod_dir, exist_ok=True)

    # Add note if no documents are found on portal
    if not folders:
        txt_name = 'No documents on DPE portal for this modification.txt'
        open(os.path.join(mod_dir, txt_name), 'a').close()

    # Create link to DPE Major Projects page for current modification
    text = """<html>
              <meta http-equiv="refresh" content="0; url={}">
              </html>""".format(mod_url)
    with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
        f.write(text)

    document_data = []
    for folder in folders:
        # Folder and document names come from the portal page just like the
        # project and modification names, so they need the same clean-up;
        # a stray '/' would otherwise add an extra directory level
        folder_name = make_safe(folder.xpath('a[2]')[0].text)

        # Get documents in current folder
        for document in folder.xpath('ul/li/a'):
            doc = {}
            doc['url'] = document.get('href')
            doc['name'] = document.text
            doc['document_path'] = os.path.join(
                output_dir, project_name, mod_name, folder_name,
                make_safe(doc['name']))
            document_data.append(doc)

    return document_data


def download_document(url, document_path):
    """Download document from given url.

    Nothing is downloaded when a non-empty file already exists at
    document_path. A failed connection or an invalid URL is written to the
    log instead of being raised.

    Args:
        url: Address of the document.
        document_path: Destination file path; missing parent directories
            are created.
    """
    parent_dir = os.path.dirname(document_path)
    try:
        # Create output directories as required. This has to happen before
        # the file is probed, otherwise the probe fails for every new folder
        # and not only for paths that are too long.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Check if destination path is too long (Windows filename limitation)
        open(document_path, 'a').close()
    except FileNotFoundError:
        # The '\\?\' prefix only means something on Windows; elsewhere it
        # would just create a directory literally named '\\?\'
        if os.name != 'nt':
            raise
        document_path = '\\\\?\\' + os.path.abspath(document_path)
        os.makedirs(os.path.dirname(document_path), exist_ok=True)

    # Skip the download if a non-empty file exists
    if os.path.isfile(document_path) and os.path.getsize(document_path) > 0:
        return

    try:
        # Attempt to download file
        r = requests.get(url, stream=True)
    except (ConnectionError, InvalidURL):
        # The path ends with <project>/<modification>/<folder>/<name>.
        # Count from the end so the message does not depend on how many
        # components the output directory has; pad short paths with ''.
        parts = document_path.split(os.sep)
        parts = [''] * (4 - len(parts)) + parts
        project, modification, folder, name = parts[-4:]
        logging.error(
            ('Failed to download {}\n'
             '  Project: {}\n'
             '  Modification: {}\n'
             '  Folder: {}\n').format(name, project, modification, folder))
        return

    # Write file to disk, and release the connection afterwards
    try:
        with open(document_path, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    finally:
        r.close()


def main():
    """Run the command line interface.

    Parses the options, collects the modification job ids (given directly
    and/or taken from a search results page), downloads every document of
    each modification, and finally keeps 'errors.log' only if something was
    written to it.
    """
    example_text = """examples:

    # Grab a single project modification using its job id, and save in 'files'
    major_projects_grabber -i 1746 -o files

    # Grab all modifications in search results page, and save in current folder
    major_projects_grabber -u "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"
    """

    # Describe the command line options
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=example_text)
    parser.add_argument(
        '-o', default='.', metavar='OUTPUT_DIR', help='root output directory')
    parser.add_argument(
        '-i', nargs='*', default=[], metavar='ID',
        help='modification job id(s)')
    parser.add_argument('-u', metavar='URL', help='url of search results page')

    # Called without any argument: show the help text and give up
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    options = parser.parse_args()
    output_dir = options.o
    mod_ids = options.i

    # Errors are collected in a log file inside the output directory
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)

    # Add the job ids found on the search results page, if one was given
    if options.u:
        mod_ids.extend(mod_ids_from_search(options.u))

    # Outer bar: modifications, inner bar: documents of one modification
    mod_pbar = tqdm(mod_ids)
    for mod_id in mod_pbar:
        doc_pbar = tqdm(get_document_list(mod_id, output_dir))

        for doc in doc_pbar:
            # Label both bars using the components of the destination path
            path_parts = doc['document_path'].split(os.sep)
            mod_label = path_parts[-3]
            doc_label = path_parts[-1]
            mod_pbar.set_description(mod_label)
            doc_pbar.set_description(doc_label)

            download_document(doc['url'], doc['document_path'])

    # Tidy up console after tqdm
    print('\n')

    # Close the log so that its contents can be inspected
    logging.shutdown()
    with open(log_name, 'r') as f:
        has_errors = bool(f.read())

    # An empty log is removed; otherwise the user is pointed to it
    if not has_errors:
        os.remove(log_name)
        return

    warnings.warn(
        'Some files failed to download. See log for details.',
        stacklevel=2)


if __name__ == '__main__':
    main()
Initial commit 6 years ago			`"""major_projects_grabber.py`
			`Download documents from the NSW DPE Major Projects website.`

			`Example usage:`
			`# Grab a single project modification using its job id, and save in 'files'`
Update docstring 6 years ago			`major_projects_grabber -i 1746 -o files`
Initial commit 6 years ago
			`# Grab all modifications in search results page, and save in current folder`
Update docstring 6 years ago			`major_projects_grabber -u "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"`
Initial commit 6 years ago
			`"""`

			`import os`
			`import re`
			`import sys`
			`import shutil`
			`import logging`
			`import requests`
			`import argparse`
Show warning if some files failed to download 6 years ago			`import warnings`
Initial commit 6 years ago			`from lxml import html`
Update docstrings 6 years ago			`from tqdm import tqdm`
Initial commit 6 years ago			`from requests.exceptions import ConnectionError, InvalidURL`


Replace invalid characters in project and mod names 6 years ago			`def make_safe(s):`
			`"""Remove characters that would be invalid in a filepath"""`
Remove trailing spaces in folder names 6 years ago			`# Remove trailing spaces`
			`s_safe = s.strip()`

Replace invalid characters in project and mod names 6 years ago			`# Remove '\', '*', '"', '<', '>' '\|'`
Remove trailing spaces in folder names 6 years ago			`s_safe = re.sub('\\\\|\*\|"\|<\|>\\|', '', s_safe)`
Replace invalid characters in project and mod names 6 years ago
			`# Replace '/' and ':' with '-'`
			`s_safe = re.sub(':', ' -', s_safe)`
			`s_safe = re.sub('/', '-', s_safe)`

			`return s_safe`


Add progress bars 6 years ago			`def mod_ids_from_search(search_results_url):`
			`"""Get modification job IDs from search results URL"""`

			`# Get HTML of search results page`
			`page = requests.get(search_results_url)`

			`# Create HTML tree`
			`tree = html.fromstring(page.content)`

			`# Find job ids of items in results list`
			`mod_ids = []`
			`mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')`
			`for mod in mods:`
			`mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())`

			`return mod_ids`


			`def get_document_list(mod_id, output_dir):`
			`"""Get list of documents from project modification ID"""`
Initial commit 6 years ago
			`# Get html from mod page`
			`mod_url = ('http://majorprojects.planning.nsw.gov.au/'`
			`'index.pl?action=view_job&job_id=' + mod_id)`
			`mod_page = requests.get(mod_url)`
			`mod_tree = html.fromstring(mod_page.content)`

			`# Get mod details`
			`project_name = mod_tree.xpath(`
			`'//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text`
			`mod_name = mod_tree.xpath(`
			`'//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text`

			`# Get list of document folders`
			`folders = mod_tree.xpath('//div[@class="folder_row"]')`
Add note when no documents are available on DPE portal 6 years ago
Replace invalid characters in project and mod names 6 years ago			`# Remove invalid characters before creating folders`
			`project_name = make_safe(project_name)`
			`mod_name = make_safe(mod_name)`

Add note when no documents are available on DPE portal 6 years ago			`# Create modification folder`
			`mod_dir = os.path.join(output_dir, project_name, mod_name)`
Add support for folder paths longer than 255 characters (Windows) 6 years ago			`try:`
			`os.makedirs(mod_dir, exist_ok=True)`
			`except FileNotFoundError:`
			`# Fix destination path if longer than 255 characters (Windows only)`
			`mod_dir = '\\\\?\\' + os.path.abspath(mod_dir)`
			`os.makedirs(mod_dir, exist_ok=True)`
Add note when no documents are available on DPE portal 6 years ago
			`# Add note if no documents are found on portal`
			`if not folders:`
			`txt_name = 'No documents on DPE portal for this modification.txt'`
Update docstrings 6 years ago			`open(os.path.join(mod_dir, txt_name), 'a').close()`
Add note when no documents are available on DPE portal 6 years ago
			`# Create link to DPE Major Projects page for current modification`
			`text = """<html>`
			`<meta http-equiv="refresh" content="0; url={}">`
			`</html>""".format(mod_url)`
			`with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:`
			`f.write(text)`

Add progress bars 6 years ago			`document_data = []`
Initial commit 6 years ago			`for folder in folders:`
			`folder_name = folder.xpath('a[2]')[0].text.strip()`

			`# Get documents in current folder`
			`documents = folder.xpath('ul/li/a')`
			`for document in documents:`
Add progress bars 6 years ago			`doc = {}`
			`doc['url'] = document.get('href')`
			`doc['name'] = document.text`
			`doc['document_path'] = os.path.join(`
			`output_dir, project_name, mod_name, folder_name, doc['name'])`
			`document_data.append(doc)`

			`return document_data`


			`def download_document(url, document_path):`
			`"""Download document from given url"""`
Add support for Windows file paths longer than 255 characters 6 years ago
Replace invalid characters in project and mod names 6 years ago			`# Check if destination path is too long (Windows filename limitation)`
Add progress bars 6 years ago			`try:`
Replace invalid characters in project and mod names 6 years ago			`open(document_path, 'a').close()`
			`except FileNotFoundError:`
			`document_path = '\\\\?\\' + os.path.abspath(document_path)`

			`# Create output directories as required`
			`os.makedirs(os.path.dirname(document_path), exist_ok=True)`
Add progress bars 6 years ago
Fix bug which downloaded only the first file in a folder 6 years ago			`# Check if a non-empty file exists`
			`if os.path.isfile(document_path) and os.path.getsize(document_path) > 0:`
Add progress bars 6 years ago			`pass`
			`else:`
			`try:`
			`# Attempt to download file`
			`r = requests.get(url, stream=True)`
			`except (ConnectionError, InvalidURL):`
			`logging.error(`
			`('Failed to download {4}\n'`
			`' Project: {1}\n'`
			`' Modification: {2}\n'`
			`' Folder: {3}\n').format(*document_path.split(os.sep)))`
			`return`
Add support for Windows file paths longer than 255 characters 6 years ago
Add progress bars 6 years ago			`# Write file to disk`
			`with open(document_path, 'wb') as f:`
			`shutil.copyfileobj(r.raw, f)`
Initial commit 6 years ago

			`def main():`
			`example_text = """examples:`

			`# Grab a single project modification using its job id, and save in 'files'`
Update docstrings 6 years ago			`major_projects_grabber -i 1746 -o files`
Initial commit 6 years ago
			`# Grab all modifications in search results page, and save in current folder`
Update URL in docstring 6 years ago			`major_projects_grabber -u "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"`
Initial commit 6 years ago			`"""`

			`# Set up command line arguments`
			`parser = argparse.ArgumentParser(`
			`epilog=example_text,`
			`formatter_class=argparse.RawDescriptionHelpFormatter)`
			`parser.add_argument(`
			`'-o', metavar='OUTPUT_DIR', default='.', help='root output directory')`
			`parser.add_argument(`
Add note when no documents are available on DPE portal 6 years ago			`'-i',`
			`metavar='ID',`
			`default=[],`
			`help='modification job id(s)',`
			`nargs='*')`
Initial commit 6 years ago			`parser.add_argument('-u', metavar='URL', help='url of search results page')`

			`# Print usage if no arguments are provided`
			`if len(sys.argv) == 1:`
			`parser.print_help(sys.stderr)`
			`sys.exit(1)`

			`# Parse arguments`
			`args = parser.parse_args()`
			`search_results_url = args.u`
			`output_dir = args.o`
			`mod_ids = args.i`

			`# Set up log File`
			`os.makedirs(output_dir, exist_ok=True)`
			`log_name = os.path.join(output_dir, 'errors.log')`
			`logging.basicConfig(filename=log_name, level=logging.ERROR)`

			`# Get mod IDs from search results`
			`if search_results_url:`
			`search_mod_ids = mod_ids_from_search(search_results_url)`
			`mod_ids.extend(search_mod_ids)`

Add progress bars 6 years ago			`# Get list of documents from given modification IDs`
			`mod_pbar = tqdm(mod_ids)`
			`for mod_id in mod_pbar:`
			`document_data = get_document_list(mod_id, output_dir)`

			`# Download documents for current modification`
			`doc_pbar = tqdm(document_data)`

			`for doc in doc_pbar:`
			`# Update progress bars`
			`mod_name = doc['document_path'].split(os.sep)[-3]`
			`doc_name = doc['document_path'].split(os.sep)[-1]`
			`mod_pbar.set_description(mod_name)`
			`doc_pbar.set_description(doc_name)`

			`# Download document`
			`download_document(doc['url'], doc['document_path'])`

			`# Tidy up console after tqdm`
			`print('\n')`
Initial commit 6 years ago
Tidy up log file when finished downloading 6 years ago			`# Stop logging`
			`logging.shutdown()`
Show warning if some files failed to download 6 years ago			`with open(log_name, 'r') as f:`
			`log_data = f.read()`

Tidy up log file when finished downloading 6 years ago			`# Check contents of log file`
Show warning if some files failed to download 6 years ago			`if log_data:`
			`warnings.warn(`
			`'Some files failed to download. See log for details.',`
			`stacklevel=2)`
Tidy up log file when finished downloading 6 years ago			`else:`
			`os.remove(log_name)`
Show warning if some files failed to download 6 years ago
Initial commit 6 years ago
			`if __name__ == '__main__':`
			`main()`