"""major_projects_grabber.py
Download documents from the NSW DPE Major Projects website.
Example usage:
# Grab a single project modification using its job id, and save in 'files'
major_projects_grabber -o files -i 1746
6 years ago
# Grab all modifications in search results page, and save in current folder
major_projects_grabber "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"
6 years ago
"""
import os
import re
import sys
import shutil
import logging
import requests
import argparse
import pandas as pd
from lxml import html
from tqdm import tqdm
from requests.exceptions import ConnectionError, InvalidURL


def get_documents(mod_id, output_dir):
    """Download all documents from project modification ID"""
    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id=' + mod_id)
    mod_page = requests.get(mod_url)
    mod_tree = html.fromstring(mod_page.content)
    # Get mod details
    project_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text
    mod_name = mod_tree.xpath(
        '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text
    # Get list of document folders
    folders = mod_tree.xpath('//div[@class="folder_row"]')
    # Create modification folder
    mod_dir = os.path.join(output_dir, project_name, mod_name)
    os.makedirs(mod_dir, exist_ok=True)
    # Add note if no documents are found on portal
    if not folders:
        txt_name = 'No documents on DPE portal for this modification.txt'
        open(os.path.join(mod_dir, txt_name), 'a').close()
    # Create link to DPE Major Projects page for current modification
    text = """<html>
    <meta http-equiv="refresh" content="0; url={}">
    </html>""".format(mod_url)
    with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
        f.write(text)
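
    # Each folder row holds a heading link plus a list of document links; files
    # are written under output_dir/<project_name>/<mod_name>/<folder_name>/.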
    for folder in folders:
        folder_name = folder.xpath('a[2]')[0].text.strip()
        # Get documents in current folder
        documents = folder.xpath('ul/li/a')
        for document in documents:
            document_url = document.get('href')
            document_name = document.text
            document_path = os.path.join(output_dir, project_name, mod_name,
                                         folder_name, document_name)
            # Create output directories as required
            try:
                os.makedirs(os.path.dirname(document_path), exist_ok=True)
            except OSError:
                logging.error(('Failed to download {}\n'
                               ' Project: {}\n'
                               ' Modification: {}\n'
                               ' Folder: {}\n').format(
                                   document_name, project_name, mod_name,
                                   folder_name))
                continue
            # Download document, if it does not already exist
            if not os.path.isfile(document_path):
                try:
                    r = requests.get(document_url, stream=True)
                except (ConnectionError, InvalidURL):
                    logging.error(('Failed to download {}\n'
                                   ' Project: {}\n'
                                   ' Modification: {}\n'
                                   ' Folder: {}\n').format(
                                       document_name, project_name, mod_name,
                                       folder_name))
                    continue
                with open(document_path, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
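
# Note: get_documents() can also be called directly from Python rather than via
# the command line; e.g. get_documents('1746', 'files') mirrors the
# '-o files -i 1746' example in the module docstring (values are illustrative).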


def mod_ids_from_search(search_results_url):
    """Get modification job IDs from search results URL"""
    # Get HTML of search results page
    page = requests.get(search_results_url)
    # Create HTML tree
    tree = html.fromstring(page.content)
    # Find job ids of items in results list
    mod_ids = []
    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
    for mod in mods:
        mod_ids.append(re.search(r'(?<=job_id=)\d+', mod.get('href')).group())
    return mod_ids
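
# For illustration: a results page whose item links contain
# 'index.pl?action=view_job&job_id=1746' would yield ['1746', ...] from the
# function above.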


def main():
    example_text = """examples:
# Grab a single project modification using its job id, and save in 'files'
major_projects_grabber -i 1746 -o files

# Grab all modifications in a search results page, and save in current folder
major_projects_grabber -u "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"
"""
    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i',
        metavar='ID',
        default=[],
        help='modification job id(s)',
        nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')
    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    mod_ids = args.i
    # Set up log file
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)
    # Get mod IDs from search results
    if search_results_url:
        search_mod_ids = mod_ids_from_search(search_results_url)
        mod_ids.extend(search_mod_ids)
    # Download documents from given modification ids
    for mod_id in tqdm(mod_ids):
        get_documents(mod_id, output_dir)


if __name__ == '__main__':
    main()