major-projects-grabber/major_projects_grabber/major_projects_grabber.py

"""major_projects_grabber.py
Download documents from the NSW DPE Major Projects website.

Example usage:
    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019

    # Grab all modifications in search results page, and save in current folder
    # (quote the URL so the shell does not interpret its '&' characters)
    python major_projects_grabber.py -o . -u "http://..."

"""

# search url
# http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16

# mod url
# http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503

import os
import re
import sys
import shutil
import logging
import requests
import argparse
import pandas as pd
from lxml import html
from requests.exceptions import ConnectionError, InvalidURL


def _log_download_failure(document_name, project_name, mod_name, folder_name):
    """Record a document that could not be downloaded in the error log."""
    logging.error(('Failed to download {}\n'
                   '  Project: {}\n'
                   '  Modification: {}\n'
                   '  Folder: {}\n').format(
                       document_name, project_name, mod_name, folder_name))


def _download_file(url, path, timeout):
    """Stream `url` into `path` without leaving a partial file on failure.

    The body is written to `path + '.part'` and only renamed to `path`
    once the transfer has completed, so an interrupted download is never
    mistaken for a finished one.

    Raises:
        requests.exceptions.RequestException: the request failed, timed
            out, or returned an HTTP error status.
        OSError: the file could not be written or renamed.
    """
    partial_path = path + '.part'
    r = requests.get(url, stream=True, timeout=timeout)
    try:
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            # iter_content() undoes gzip/deflate transfer encoding, which
            # copying from r.raw directly does not do
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    except (requests.exceptions.RequestException, OSError):
        # Best-effort cleanup; the original error is re-raised below
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise
    finally:
        r.close()
    os.replace(partial_path, path)


def get_documents(mod_id, output_dir, timeout=60):
    """Download all documents from project modification ID

    Documents are saved as
    `output_dir/<project>/<modification>/<folder>/<document>`, using the
    names shown on the modification page. Documents that already exist
    on disk are skipped. Failures are written to the error log and do not
    stop the remaining downloads.

    Args:
        mod_id: job id of the modification (string or integer).
        output_dir: root directory for the downloaded files.
        timeout: timeout in seconds passed to each HTTP request.
    """

    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id={}'.format(mod_id))
    try:
        mod_page = requests.get(mod_url, timeout=timeout)
        mod_page.raise_for_status()
    except requests.exceptions.RequestException:
        logging.error('Failed to load modification page {}'.format(mod_url))
        return
    mod_tree = html.fromstring(mod_page.content)

    # Get mod details
    header = '//*[@class="vpa_header vpa_header_contrast"]'
    try:
        project_name = mod_tree.xpath(header + '/h2')[0].text
        mod_name = mod_tree.xpath(header + '/h1')[0].text
    except IndexError:
        project_name = mod_name = None
    if project_name is None or mod_name is None:
        # Without both names there is no directory to save documents into
        logging.error(
            'No project details found on modification page {}'.format(mod_url))
        return

    # Get list of document folders
    for folder in mod_tree.xpath('//div[@class="folder_row"]'):
        try:
            folder_name = folder.xpath('a[2]')[0].text.strip()
        except (IndexError, AttributeError):
            logging.error(
                'Skipped a folder with no name on modification page {}'.format(
                    mod_url))
            continue

        # Get documents in current folder
        for document in folder.xpath('ul/li/a'):
            document_url = document.get('href')
            document_name = document.text
            if not document_url or not document_name:
                # A link without a target or a name cannot be saved
                _log_download_failure(
                    document_name, project_name, mod_name, folder_name)
                continue

            document_path = os.path.join(output_dir, project_name, mod_name,
                                         folder_name, document_name)

            # Download document, if it does not already exist
            if os.path.isfile(document_path):
                continue

            try:
                # Create output directories as required
                os.makedirs(os.path.dirname(document_path), exist_ok=True)
                _download_file(document_url, document_path, timeout)
            except (requests.exceptions.RequestException, OSError):
                _log_download_failure(
                    document_name, project_name, mod_name, folder_name)


def mod_ids_from_search(search_results_url, timeout=60):
    """Get modification job IDs from search results URL

    Args:
        search_results_url: URL of a search results page.
        timeout: timeout in seconds passed to the HTTP request.

    Returns:
        List of job id strings, in the order the links appear in the
        results list. Links without a `job_id=<digits>` part are ignored.

    Raises:
        requests.exceptions.RequestException: the page could not be
            fetched or the server returned an HTTP error status.
    """

    # Get HTML of search results page
    page = requests.get(search_results_url, timeout=timeout)
    # Fail loudly rather than parsing an error page into an empty result
    page.raise_for_status()

    # Create HTML tree
    tree = html.fromstring(page.content)

    # Find job ids of items in results list
    job_id_pattern = re.compile(r'(?<=job_id=)\d+')
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        # href may be missing, or may point somewhere without a job id
        match = job_id_pattern.search(mod.get('href') or '')
        if match:
            mod_ids.append(match.group())

    return mod_ids


def main():
    """Parse the command line and download the requested documents.

    Job ids given with `-i` and job ids found on the search results page
    given with `-u` are combined, and the documents of each are saved
    under the output directory given with `-o`. Download errors are
    logged to `errors.log` inside the output directory. With no
    arguments at all, the usage text is printed and the process exits
    with status 1.
    """
    example_text = """examples:

    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019

    # Grab all modifications in search results page, and save in current folder
    python major_projects_grabber.py -o . -u "http://..."
    """

    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i', metavar='ID', default=[], help='modification job id(s)', nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')

    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    # Copy so that extending the list never mutates argparse's default
    mod_ids = list(args.i)

    # Set up log File
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)

    # Get mod IDs from search results
    if search_results_url:
        mod_ids.extend(mod_ids_from_search(search_results_url))

    # Download documents from given modification ids
    for mod_id in mod_ids:
        get_documents(mod_id, output_dir)


# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module
if __name__ == '__main__':
    main()
Initial commit 6 years ago			`"""major_projects_grabber.py`
			`Download documents from the NSW DPE Major Projects website.`

			`Example usage:`
			`# Grab a single project modification using its job id, and save in 'files'`
			`python major_projects_grabber.py -o files -i 1019`

			`# Grab all modifications in search results page, and save in current folder`
			`python major_projects_grabber.py -o . http://`

			`"""`

			`# search url`
			`# http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16`

			`# mod url`
			`# http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503`

			`import os`
			`import re`
			`import sys`
			`import shutil`
			`import logging`
			`import requests`
			`import argparse`
			`import pandas as pd`
			`from lxml import html`
			`from requests.exceptions import ConnectionError, InvalidURL`


			`def get_documents(mod_id, output_dir):`
			`"""Download all documents from project modification ID`
			`"""`

			`# Get html from mod page`
			`mod_url = ('http://majorprojects.planning.nsw.gov.au/'`
			`'index.pl?action=view_job&job_id=' + mod_id)`
			`mod_page = requests.get(mod_url)`
			`mod_tree = html.fromstring(mod_page.content)`

			`# Get mod details`
			`project_name = mod_tree.xpath(`
			`'//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text`
			`mod_name = mod_tree.xpath(`
			`'//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text`

			`# Get list of document folders`
			`folders = mod_tree.xpath('//div[@class="folder_row"]')`
			`for folder in folders:`
			`folder_name = folder.xpath('a[2]')[0].text.strip()`

			`# Get documents in current folder`
			`documents = folder.xpath('ul/li/a')`
			`for document in documents:`
			`document_url = document.get('href')`
			`document_name = document.text`
			`document_path = os.path.join(output_dir, project_name, mod_name,`
			`folder_name, document_name)`

			`# Create output directories as required`
			`try:`
			`os.makedirs(os.path.dirname(document_path), exist_ok=True)`
			`except OSError:`
			`logging.error(('Failed to download {}\n'`
			`' Project: {}\n'`
			`' Modification: {}\n'`
			`' Folder: {}\n').format(`
			`document_name, project_name, mod_name,`
			`folder_name))`
			`continue`

			`# Download document, if it does not already exist`
			`if os.path.isfile(document_path):`
			`pass`
			`else:`
			`try:`
			`r = requests.get(document_url, stream=True)`
			`except (ConnectionError, InvalidURL):`
			`logging.error(('Failed to download {}\n'`
			`' Project: {}\n'`
			`' Modification: {}\n'`
			`' Folder: {}\n').format(`
			`document_name, project_name, mod_name,`
			`folder_name))`
			`continue`
			`with open(document_path, 'wb') as f:`
			`shutil.copyfileobj(r.raw, f)`


			`def mod_ids_from_search(search_results_url):`
			`"""Get modification job IDs from search results URL"""`

			`# Get HTML of search results page`
			`page = requests.get(search_results_url)`

			`# Create HTML tree`
			`tree = html.fromstring(page.content)`

			`# Find job ids of items in results list`
			`mod_ids = []`
			`mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')`
			`for mod in mods:`
			`mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())`

			`return mod_ids`


			`def main():`
			`example_text = """examples:`

			`# Grab a single project modification using its job id, and save in 'files'`
			`python major_projects_grabber.py -o files -i 1019`

			`# Grab all modifications in search results page, and save in current folder`
			`python major_projects_grabber.py -o . http://`
			`"""`

			`# Set up command line arguments`
			`parser = argparse.ArgumentParser(`
			`epilog=example_text,`
			`formatter_class=argparse.RawDescriptionHelpFormatter)`
			`parser.add_argument(`
			`'-o', metavar='OUTPUT_DIR', default='.', help='root output directory')`
			`parser.add_argument(`
			`'-i', metavar='ID', default=[], help='modification job id(s)', nargs='*')`
			`parser.add_argument('-u', metavar='URL', help='url of search results page')`

			`# Print usage if no arguments are provided`
			`if len(sys.argv) == 1:`
			`parser.print_help(sys.stderr)`
			`sys.exit(1)`

			`# Parse arguments`
			`args = parser.parse_args()`
			`search_results_url = args.u`
			`output_dir = args.o`
			`mod_ids = args.i`

			`# Set up log File`
			`os.makedirs(output_dir, exist_ok=True)`
			`log_name = os.path.join(output_dir, 'errors.log')`
			`logging.basicConfig(filename=log_name, level=logging.ERROR)`

			`# Get mod IDs from search results`
			`if search_results_url:`
			`search_mod_ids = mod_ids_from_search(search_results_url)`
			`mod_ids.extend(search_mod_ids)`

			`# Download documents from given modification ids`
			`for mod_id in mod_ids:`
			`get_documents(mod_id, output_dir)`


			`if __name__ == '__main__':`
			`main()`