Initial commit

8 years ago · ee718fd7c3
commit ee718fd7c3
3 changed files with 171 additions and 0 deletions
--- a/major_projects_grabber/__init__.py
+++ b/major_projects_grabber/__init__.py
@@ -0,0 +1 @@
 from .major_projects_grabber import main
--- a/major_projects_grabber/major_projects_grabber.py
+++ b/major_projects_grabber/major_projects_grabber.py
@@ -0,0 +1,155 @@
 """major_projects_grabber.py
 Download documents from the NSW DPE Major Projects website.
 Example usage:
    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019
    # Grab all modifications in search results page, and save in current folder
    python major_projects_grabber.py -o . -u "http://..."
 """
 # search url
 # http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16
 # mod url
 # http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503
 import os
 import re
 import sys
 import shutil
 import logging
 import requests
 import argparse
 import pandas as pd
 from lxml import html
 from requests.exceptions import ConnectionError, InvalidURL
 def _log_download_failure(document_name, project_name, mod_name, folder_name):
    """Record a failed document download in the error log."""
    logging.error(('Failed to download {}\n'
                   '  Project: {}\n'
                   '  Modification: {}\n'
                   '  Folder: {}\n').format(
                       document_name, project_name, mod_name, folder_name))
 def get_documents(mod_id, output_dir):
    """Download all documents attached to a project modification.

    The modification page is fetched and parsed, and every document link
    found in a folder row is saved as
    ``output_dir/<project>/<modification>/<folder>/<document>``.
    Documents that already exist on disk are skipped.  A failure for an
    individual document is written to the error log and the remaining
    documents are still processed.

    Args:
        mod_id: Job id of the modification (string or integer).
        output_dir: Root directory under which documents are saved.

    Raises:
        IndexError: If the expected header elements (or a folder name
            link) are not found in the page.
    """
    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
               'index.pl?action=view_job&job_id=' + str(mod_id))
    mod_page = requests.get(mod_url)
    mod_tree = html.fromstring(mod_page.content)
    # Get mod details
    header = '//*[@class="vpa_header vpa_header_contrast"]'
    project_name = mod_tree.xpath(header + '/h2')[0].text
    mod_name = mod_tree.xpath(header + '/h1')[0].text
    # Walk the document folders
    for folder in mod_tree.xpath('//div[@class="folder_row"]'):
        folder_name = folder.xpath('a[2]')[0].text.strip()
        for document in folder.xpath('ul/li/a'):
            document_url = document.get('href')
            document_name = document.text
            details = (document_name, project_name, mod_name, folder_name)
            if not document_name:
                # A link without text gives no file name to save under
                _log_download_failure(*details)
                continue
            document_path = os.path.join(output_dir, project_name, mod_name,
                                         folder_name, document_name)
            # Create output directories as required
            try:
                os.makedirs(os.path.dirname(document_path), exist_ok=True)
            except OSError:
                _log_download_failure(*details)
                continue
            # Download document, if it does not already exist
            if os.path.isfile(document_path):
                continue
            # Write to a temporary name first, so that an interrupted
            # download is never mistaken for a finished document later
            partial_path = document_path + '.part'
            try:
                with requests.get(document_url, stream=True) as r:
                    # Do not save an HTTP error page as the document
                    r.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        # iter_content() undoes gzip/deflate transfer
                        # encoding, which reading r.raw directly does not
                        for chunk in r.iter_content(chunk_size=65536):
                            f.write(chunk)
                os.replace(partial_path, document_path)
            except (requests.exceptions.RequestException, OSError):
                _log_download_failure(*details)
                try:
                    os.remove(partial_path)
                except OSError:
                    # Nothing was written, or the leftover cannot be removed
                    pass
 def mod_ids_from_search(search_results_url):
    """Get modification job IDs from a search results page.

    Args:
        search_results_url: URL of the search results page to fetch.

    Returns:
        List of job id strings, in the order their links appear in the
        results list.  Links whose href has no ``job_id=<digits>`` part
        (or no href at all) are ignored.
    """
    # Get HTML of search results page and build the HTML tree
    page = requests.get(search_results_url)
    tree = html.fromstring(page.content)
    # Raw string: '\d' must reach the regex engine rather than being
    # treated as an (invalid) string escape sequence
    job_id_pattern = re.compile(r'(?<=job_id=)\d+')
    # Find job ids of items in results list
    mod_ids = []
    for mod in tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a'):
        match = job_id_pattern.search(mod.get('href') or '')
        if match:
            mod_ids.append(match.group())
    return mod_ids
 def main():
    """Command line entry point: parse arguments and download documents.

    Modification job ids given with ``-i`` are combined with any ids found
    on the search results page given with ``-u``, and the documents of each
    modification are saved under the ``-o`` output directory.  Messages
    logged at error level are written to ``errors.log`` in that directory.
    When called without any command line arguments, the help text is
    printed to stderr and the process exits with status 1.
    """
    example_text = """examples:
    # Grab a single project modification using its job id, and save in 'files'
    python major_projects_grabber.py -o files -i 1019
    # Grab all modifications in search results page, and save in current folder
    python major_projects_grabber.py -o . -u "http://..."
    """
    # Set up command line arguments
    parser = argparse.ArgumentParser(
        epilog=example_text,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
    parser.add_argument(
        '-i', metavar='ID', default=[], help='modification job id(s)', nargs='*')
    parser.add_argument('-u', metavar='URL', help='url of search results page')
    # Print usage if no arguments are provided
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    # Parse arguments
    args = parser.parse_args()
    search_results_url = args.u
    output_dir = args.o
    # Copy, so the list owned by argparse is not modified below
    mod_ids = list(args.i)
    # Set up log File
    os.makedirs(output_dir, exist_ok=True)
    log_name = os.path.join(output_dir, 'errors.log')
    logging.basicConfig(filename=log_name, level=logging.ERROR)
    # Get mod IDs from search results
    if search_results_url:
        mod_ids.extend(mod_ids_from_search(search_results_url))
    # Download documents from given modification ids
    for mod_id in mod_ids:
        get_documents(mod_id, output_dir)
 # Run the command line interface only when executed as a script
 if __name__ == "__main__":
    main()
--- a/setup.py
+++ b/setup.py
@@ -0,0 +1,15 @@
 from setuptools import setup
 # Package definition: metadata first, then the package list, its
 # dependencies, and the console command that maps to the package's main()
 setup(
    name="major_projects_grabber",
    version="0.1.0",
    description="Download documents from the NSW DPE Major Projects website.",
    author="Dan Howe",
    author_email="d.howe@wrl.unsw.edu.au",
    packages=["major_projects_grabber"],
    install_requires=["requests", "pandas", "lxml"],
    entry_points={
        "console_scripts": [
            "major_projects_grabber = major_projects_grabber:main",
        ],
    },
 )