"""major_projects_grabber.py
Download documents from the NSW DPE Major Projects website.
Example usage:
# Grab a single project modification using its job id, and save in 'files'
major_projects_grabber -o files -i 1746
# Grab all modifications in search results page, and save in current folder
major_projects_grabber "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"
"""
import os
import re
import sys
import shutil
import logging
import requests
import argparse
import pandas as pd
from lxml import html
from tqdm import tqdm
from requests.exceptions import ConnectionError, InvalidURL
def _log_failure(document_name, project_name, mod_name, folder_name):
    """Log a single document-download failure to the error log."""
    logging.error(('Failed to download {}\n'
                   '    Project: {}\n'
                   '    Modification: {}\n'
                   '    Folder: {}\n').format(
                       document_name, project_name, mod_name, folder_name))


def get_documents(mod_id, output_dir):
    """Download all documents for one project modification.

    Scrapes the modification's portal page, then saves every listed
    document under ``output_dir/<project>/<modification>/<folder>/``.
    Also writes a ``DPE-portal-page.html`` redirect back to the portal
    page, and a marker ``.txt`` file when the portal lists no documents.
    Documents already present on disk are skipped; individual failures
    are logged and do not abort the remaining downloads.

    Args:
        mod_id: Modification job id string, as used by the DPE portal.
        output_dir: Root directory to save files under.
    """
    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id=' + mod_id)
    mod_page = requests.get(mod_url)
    mod_tree = html.fromstring(mod_page.content)
    # Project and modification names come from the page header block.
    project_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text
    mod_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text
    # Get list of document folders
    folders = mod_tree.xpath('//div[@class="folder_row"]')
    # Create modification folder
    mod_dir = os.path.join(output_dir, project_name, mod_name)
    os.makedirs(mod_dir, exist_ok=True)
    # Add note if no documents are found on portal
    if not folders:
        txt_name = 'No documents on DPE portal for this modification.txt'
        open(os.path.join(mod_dir, txt_name), 'a').close()
    # Create link to DPE Major Projects page for current modification
    text = """<html>
<meta http-equiv="refresh" content="0; url={}">
</html>""".format(mod_url)
    with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
        f.write(text)
    for folder in folders:
        folder_name = folder.xpath('a[2]')[0].text.strip()
        # Get documents in current folder
        documents = folder.xpath('ul/li/a')
        for document in documents:
            document_url = document.get('href')
            document_name = document.text
            document_path = os.path.join(mod_dir, folder_name, document_name)
            # Create output directories as required; a bad name (e.g.
            # illegal path characters) is logged and skipped.
            try:
                os.makedirs(os.path.dirname(document_path), exist_ok=True)
            except OSError:
                _log_failure(document_name, project_name, mod_name,
                             folder_name)
                continue
            # Skip documents that were already downloaded.
            if os.path.isfile(document_path):
                continue
            try:
                # stream=True so large files are copied chunk-wise below.
                r = requests.get(document_url, stream=True)
            except (ConnectionError, InvalidURL):
                _log_failure(document_name, project_name, mod_name,
                             folder_name)
                continue
            with open(document_path, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
def mod_ids_from_search(search_results_url):
    """Return modification job IDs scraped from a search results page.

    Args:
        search_results_url: URL of a DPE Major Projects search results
            page (``index.pl?action=search&...``).

    Returns:
        List of job-id strings, one per result row whose link carries a
        ``job_id=`` query parameter; rows without one are skipped.
    """
    # Get HTML of search results page and build the element tree.
    page = requests.get(search_results_url)
    tree = html.fromstring(page.content)
    # Find job ids of items in the results list. The hrefs look like
    # 'index.pl?action=view_job&job_id=1234'.
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        # Raw string for the regex; skip anchors without a job_id
        # instead of raising AttributeError on a None match.
        match = re.search(r'(?<=job_id=)\d+', mod.get('href'))
        if match:
            mod_ids.append(match.group())
    return mod_ids
def main():
    """Command-line entry point.

    Parses ``-o`` (output dir), ``-i`` (explicit job ids) and ``-u``
    (search results URL), configures an error log inside the output
    directory, then downloads documents for every collected job id.
    """
    example_text = """examples:
# Grab a single project modification using its job id, and save in 'files'
major_projects_grabber -i 1746 -o files
# Grab all modifications in search results page, and save in current folder
major_projects_grabber http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547
"""
    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i',
        metavar='ID',
        default=[],
        help='modification job id(s)',
        nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')
    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    # Copy args.i: argparse returns the default list object itself when
    # '-i' is absent, so extending it in place would mutate the shared
    # default across parses.
    mod_ids = list(args.i)
    # Set up log file inside the output directory.
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)
    # Get mod IDs from search results
    if search_results_url:
        mod_ids.extend(mod_ids_from_search(search_results_url))
    # Download documents from given modification ids, with a progress bar.
    for mod_id in tqdm(mod_ids):
        get_documents(mod_id, output_dir)


if __name__ == '__main__':
    main()