major-projects-grabber/major_projects_grabber/major_projects_grabber.py

"""major_projects_grabber.py
Download documents from the NSW DPE Major Projects website.

Example usage:
    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019

    # Grab all modifications in search results page, and save in current folder
    python major_projects_grabber.py -o . -u "http://..."

"""

# search url
# http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16

# mod url
# http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503

import os
import re
import sys
import shutil
import logging
import requests
import argparse
import pandas as pd
from lxml import html
from requests.exceptions import ConnectionError, InvalidURL


def _safe_name(text, fallback):
    """Return scraped ``text`` as a single, safe path component.

    Names come straight from the web page, so they may be missing or may
    contain path separators that would redirect the download to another
    directory.

    Args:
        text: Scraped element text, possibly ``None`` or empty.
        fallback: Name to use when ``text`` is unusable.

    Returns:
        A string with path separators replaced by underscores.
    """
    name = text or fallback
    for separator in (os.sep, os.altsep):
        if separator:  # os.altsep is None on POSIX
            name = name.replace(separator, '_')
    # '.' and '..' would resolve to an existing/parent directory
    if name in (os.curdir, os.pardir):
        name = fallback
    return name


def _log_download_failure(document_name, project_name, mod_name,
                          folder_name):
    """Record a failed document download in the error log."""
    logging.error(('Failed to download {}\n'
                   '  Project: {}\n'
                   '  Modification: {}\n'
                   '  Folder: {}\n').format(
                       document_name, project_name, mod_name,
                       folder_name))


def get_documents(mod_id, output_dir):
    """Download all documents from project modification ID

    Files are saved as
    ``output_dir/<project>/<modification>/<folder>/<document>``. Documents
    that already exist on disk are skipped. A document that cannot be
    downloaded or written is reported through ``logging.error`` and the
    remaining documents are still processed.

    Args:
        mod_id: Job id of the modification (string or integer).
        output_dir: Root directory for downloaded files.

    Raises:
        requests.exceptions.RequestException: If the modification page
            itself cannot be fetched.
        IndexError: If the page does not contain the expected headings.
    """

    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id=' + str(mod_id))
    mod_page = requests.get(mod_url)
    mod_tree = html.fromstring(mod_page.content)

    # Get mod details (sanitised, because they become directory names)
    project_name = _safe_name(
        mod_tree.xpath(
            '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text,
        'Unnamed project')
    mod_name = _safe_name(
        mod_tree.xpath(
            '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text,
        'Unnamed modification')

    # Get list of document folders
    folders = mod_tree.xpath('//div[@class="folder_row"]')

    # Create modification folder
    mod_dir = os.path.join(output_dir, project_name, mod_name)
    os.makedirs(mod_dir, exist_ok=True)

    # Add note if no documents are found on portal
    if not folders:
        txt_name = 'No documents on DPE portal for this modification.txt'
        with open(os.path.join(mod_dir, txt_name), 'a'):
            pass

    # Create link to DPE Major Projects page for current modification
    text = """<html>
              <meta http-equiv="refresh" content="0; url={}">
              </html>""".format(mod_url)
    with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
        f.write(text)

    for folder in folders:
        folder_name = _safe_name(
            (folder.xpath('a[2]')[0].text or '').strip(), 'Unnamed folder')
        folder_dir = os.path.join(mod_dir, folder_name)

        # Get documents in current folder
        for document in folder.xpath('ul/li/a'):
            document_url = document.get('href')
            if not document.text:
                # Without a link text there is no file name to save under
                _log_download_failure(document.text, project_name, mod_name,
                                      folder_name)
                continue
            document_name = _safe_name(document.text, 'Unnamed document')
            document_path = os.path.join(folder_dir, document_name)

            # Create output directories as required
            try:
                os.makedirs(folder_dir, exist_ok=True)
            except OSError:
                _log_download_failure(document_name, project_name, mod_name,
                                      folder_name)
                continue

            # Download document, if it does not already exist
            if os.path.isfile(document_path):
                continue

            # Write to a temporary name first, so that an interrupted
            # download is never mistaken for a complete document on the
            # next run.
            partial_path = document_path + '.part'
            try:
                response = requests.get(document_url, stream=True)
                try:
                    # Do not save HTTP error pages as documents
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        # iter_content undoes gzip/deflate transfer
                        # encoding, which reading response.raw does not
                        for chunk in response.iter_content(
                                chunk_size=64 * 1024):
                            f.write(chunk)
                finally:
                    response.close()
                os.replace(partial_path, document_path)
            except (requests.exceptions.RequestException, OSError):
                _log_download_failure(document_name, project_name, mod_name,
                                      folder_name)
                try:
                    os.remove(partial_path)
                except OSError:
                    # Nothing was written, or the file is already gone
                    pass


def mod_ids_from_search(search_results_url):
    """Get modification job IDs from search results URL

    Args:
        search_results_url: URL of a Major Projects search results page.

    Returns:
        List of job id strings, in the order they appear in the results
        list. Links that carry no ``job_id=<digits>`` parameter are
        skipped.
    """

    # Get HTML of search results page
    page = requests.get(search_results_url)

    # Create HTML tree
    tree = html.fromstring(page.content)

    # Raw string, so that '\d' reaches the regex engine unchanged
    job_id_pattern = re.compile(r'(?<=job_id=)\d+')

    # Find job ids of items in results list
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        # An anchor may have no href, or an href without a job id
        match = job_id_pattern.search(mod.get('href') or '')
        if match:
            mod_ids.append(match.group())

    return mod_ids


def main():
    """Parse command line arguments and download the requested documents.

    Modification job ids are taken from ``-i`` and/or collected from the
    search results page given with ``-u``. Documents are saved below the
    ``-o`` directory, and download errors are written to ``errors.log``
    in that directory.

    Exits with status 1 (after printing the help text) when called without
    arguments, and with status 2 when neither ``-i`` nor ``-u`` supplies
    anything to download.
    """
    example_text = """examples:

    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019

    # Grab all modifications in search results page, and save in current folder
    python major_projects_grabber.py -o . -u "http://..."
    """

    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i',
        metavar='ID',
        default=[],
        help='modification job id(s)',
        nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')

    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    # Copy, so that extending the list below never modifies the parser's
    # shared default list
    mod_ids = list(args.i)

    # Stop before creating any files if there is nothing to download
    if not mod_ids and not search_results_url:
        parser.error('no modification id (-i) or search results url (-u) '
                     'given')

    # Set up log File
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)

    # Get mod IDs from search results
    if search_results_url:
        mod_ids.extend(mod_ids_from_search(search_results_url))

    # Download documents from given modification ids
    for mod_id in mod_ids:
        get_documents(mod_id, output_dir)

if __name__ == '__main__':
    main()