From ee718fd7c3e2583229ad13bd54ac3cc948fc27cf Mon Sep 17 00:00:00 2001
From: Dan Howe
Date: Fri, 17 Aug 2018 14:55:51 +1000
Subject: [PATCH] Initial commit

---
 major_projects_grabber/__init__.py |   1 +
 .../major_projects_grabber.py      | 155 ++++++++++++++++++
 setup.py                           |  15 ++
 3 files changed, 171 insertions(+)
 create mode 100644 major_projects_grabber/__init__.py
 create mode 100644 major_projects_grabber/major_projects_grabber.py
 create mode 100644 setup.py

diff --git a/major_projects_grabber/__init__.py b/major_projects_grabber/__init__.py
new file mode 100644
index 0000000..59c70d1
--- /dev/null
+++ b/major_projects_grabber/__init__.py
@@ -0,0 +1 @@
+from .major_projects_grabber import main
diff --git a/major_projects_grabber/major_projects_grabber.py b/major_projects_grabber/major_projects_grabber.py
new file mode 100644
index 0000000..c82a149
--- /dev/null
+++ b/major_projects_grabber/major_projects_grabber.py
@@ -0,0 +1,155 @@
+"""major_projects_grabber.py
+Download documents from the NSW DPE Major Projects website.
+
+Example usage:
+    # Grab a single project modification using its job id, and save in 'files'
+    python major_projects_grabber.py -o files -i 1019
+
+    # Grab all modifications on a search results page, and save in current folder
+    python major_projects_grabber.py -o . -u http://
+
+"""
+
+# search url
+# http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16
+
+# mod url
+# http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503
+
+import os
+import re
+import sys
+import shutil
+import logging
+import requests
+import argparse
+import pandas as pd
+from lxml import html
+from requests.exceptions import ConnectionError, InvalidURL
+
+
+def get_documents(mod_id, output_dir):
+    """Download all documents from a project modification ID
+    """
+
+    # Get html from mod page
+    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
+               'index.pl?action=view_job&job_id=' + mod_id)
+    mod_page = requests.get(mod_url)
+    mod_tree = html.fromstring(mod_page.content)
+
+    # Get mod details
+    project_name = mod_tree.xpath(
+        '//*[@class="vpa_header vpa_header_contrast"]/h2')[0].text
+    mod_name = mod_tree.xpath(
+        '//*[@class="vpa_header vpa_header_contrast"]/h1')[0].text
+
+    # Get list of document folders
+    folders = mod_tree.xpath('//div[@class="folder_row"]')
+    for folder in folders:
+        folder_name = folder.xpath('a[2]')[0].text.strip()
+
+        # Get documents in current folder
+        documents = folder.xpath('ul/li/a')
+        for document in documents:
+            document_url = document.get('href')
+            document_name = document.text
+            document_path = os.path.join(output_dir, project_name, mod_name,
+                                         folder_name, document_name)
+
+            # Create output directories as required
+            try:
+                os.makedirs(os.path.dirname(document_path), exist_ok=True)
+            except OSError:
+                logging.error(('Failed to create folder for {}\n'
+                               '    Project: {}\n'
+                               '    Modification: {}\n'
+                               '    Folder: {}\n').format(
+                                   document_name, project_name, mod_name,
+                                   folder_name))
+                continue
+
+            # Download document, if it does not already exist
+            if os.path.isfile(document_path):
+                continue
+
+            try:
+                r = requests.get(document_url, stream=True)
+            except (ConnectionError, InvalidURL):
+                logging.error(('Failed to download {}\n'
+                               '    Project: {}\n'
+                               '    Modification: {}\n'
+                               '    Folder: {}\n').format(
+                                   document_name, project_name, mod_name,
+                                   folder_name))
+                continue
+            with open(document_path, 'wb') as f:
+                shutil.copyfileobj(r.raw, f)
+
+
+def mod_ids_from_search(search_results_url):
+    """Get modification job IDs from search results URL"""
+
+    # Get HTML of search results page
+    page = requests.get(search_results_url)
+
+    # Create HTML tree
+    tree = html.fromstring(page.content)
+
+    # Find job ids of items in results list
+    mod_ids = []
+    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
+    for mod in mods:
+        mod_ids.append(re.search(r'(?<=job_id=)\d+', mod.get('href')).group())
+
+    return mod_ids
+
+
+def main():
+    example_text = """examples:
+
+    # Grab a single project modification using its job id, and save in 'files'
+    python major_projects_grabber.py -o files -i 1019
+
+    # Grab all modifications on a search results page, and save in current folder
+    python major_projects_grabber.py -o . -u http://
+    """
+
+    # Set up command line arguments
+    parser = argparse.ArgumentParser(
+        epilog=example_text,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument(
+        '-o', metavar='OUTPUT_DIR', default='.', help='root output directory')
+    parser.add_argument(
+        '-i', metavar='ID', default=[], help='modification job id(s)', nargs='*')
+    parser.add_argument('-u', metavar='URL', help='url of search results page')
+
+    # Print usage if no arguments are provided
+    if len(sys.argv) == 1:
+        parser.print_help(sys.stderr)
+        sys.exit(1)
+
+    # Parse arguments
+    args = parser.parse_args()
+    search_results_url = args.u
+    output_dir = args.o
+    mod_ids = args.i
+
+    # Set up log file
+    os.makedirs(output_dir, exist_ok=True)
+    log_name = os.path.join(output_dir, 'errors.log')
+    logging.basicConfig(filename=log_name, level=logging.ERROR)
+
+    # Get mod IDs from search results
+    if search_results_url:
+        search_mod_ids = mod_ids_from_search(search_results_url)
+        mod_ids.extend(search_mod_ids)
+
+    # Download documents from given modification ids
+    for mod_id in mod_ids:
+        get_documents(mod_id, output_dir)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..c01b0b6
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,15 @@
+from setuptools import setup
+
+setup(
+    name='major_projects_grabber',
+    version='0.1.0',
+    packages=['major_projects_grabber'],
+    install_requires=['requests', 'pandas', 'lxml'],
+    entry_points={
+        'console_scripts': [
+            'major_projects_grabber = major_projects_grabber:main'
+        ]
+    },
+    author='Dan Howe',
+    author_email='d.howe@wrl.unsw.edu.au',
+    description='Download documents from the NSW DPE Major Projects website.')