From e9f99f54de36769bb20ea844c410f814f4e008b2 Mon Sep 17 00:00:00 2001
From: Dan Howe
Date: Fri, 17 Aug 2018 15:56:04 +1000
Subject: [PATCH] Add progress bars

---
 .../major_projects_grabber.py | 139 ++++++++++--------
 1 file changed, 81 insertions(+), 58 deletions(-)

diff --git a/major_projects_grabber/major_projects_grabber.py b/major_projects_grabber/major_projects_grabber.py
index d449d64..3d11040 100644
--- a/major_projects_grabber/major_projects_grabber.py
+++ b/major_projects_grabber/major_projects_grabber.py
@@ -17,15 +17,31 @@ import shutil
 import logging
 import requests
 import argparse
-import pandas as pd
 from lxml import html
 from tqdm import tqdm
 from requests.exceptions import ConnectionError, InvalidURL
 
 
-def get_documents(mod_id, output_dir):
-    """Download all documents from project modification ID
-    """
+def mod_ids_from_search(search_results_url):
+    """Get modification job IDs from search results URL"""
+
+    # Get HTML of search results page
+    page = requests.get(search_results_url)
+
+    # Create HTML tree
+    tree = html.fromstring(page.content)
+
+    # Find job ids of items in results list
+    mod_ids = []
+    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
+    for mod in mods:
+        mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())
+
+    return mod_ids
+
+
+def get_document_list(mod_id, output_dir):
+    """Get list of documents from project modification ID"""
 
     # Get html from mod page
     mod_url = ('http://majorprojects.planning.nsw.gov.au/'
@@ -58,63 +74,53 @@ def get_documents(mod_id, output_dir):
     with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
         f.write(text)
 
+    document_data = []
     for folder in folders:
         folder_name = folder.xpath('a[2]')[0].text.strip()
 
         # Get documents in current folder
         documents = folder.xpath('ul/li/a')
         for document in documents:
-            document_url = document.get('href')
-            document_name = document.text
-            document_path = os.path.join(output_dir, project_name, mod_name,
-                                         folder_name, document_name)
-
-            # Create output directories as required
-            try:
-                os.makedirs(os.path.dirname(document_path), exist_ok=True)
-            except OSError:
-                logging.error(('Failed to download {}\n'
-                               '    Project: {}\n'
-                               '    Modification: {}\n'
-                               '    Folder: {}\n').format(
-                                   document_name, project_name, mod_name,
-                                   folder_name))
-                continue
-
-            # Download document, if it does not already exist
-            if os.path.isfile(document_path):
-                pass
-            else:
-                try:
-                    r = requests.get(document_url, stream=True)
-                except (ConnectionError, InvalidURL):
-                    logging.error(('Failed to download {}\n'
-                                   '    Project: {}\n'
-                                   '    Modification: {}\n'
-                                   '    Folder: {}\n').format(
-                                       document_name, project_name, mod_name,
-                                       folder_name))
-                    continue
-                with open(document_path, 'wb') as f:
-                    shutil.copyfileobj(r.raw, f)
-
-
-def mod_ids_from_search(search_results_url):
-    """Get modification job IDs from search results URL"""
-
-    # Get HTML of search results page
-    page = requests.get(search_results_url)
-
-    # Create HTML tree
-    tree = html.fromstring(page.content)
-
-    # Find job ids of items in results list
-    mod_ids = []
-    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
-    for mod in mods:
-        mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())
-
-    return mod_ids
+            doc = {}
+            doc['url'] = document.get('href')
+            doc['name'] = document.text
+            doc['document_path'] = os.path.join(
+                output_dir, project_name, mod_name, folder_name, doc['name'])
+            document_data.append(doc)
+
+    return document_data
+
+
+def download_document(url, document_path):
+    """Download document from given url"""
+
+    # Create output directories as required
+    try:
+        os.makedirs(os.path.dirname(document_path), exist_ok=True)
+    except OSError:
+        logging.error(('Failed to download {4}\n'
+                       '    Project: {1}\n'
+                       '    Modification: {2}\n'
+                       '    Folder: {3}\n').format(*document_path.split(os.sep)))
+        return
+
+    # Check if file exists
+    if os.path.isfile(document_path):
+        pass
+    else:
+        try:
+            # Attempt to download file
+            r = requests.get(url, stream=True)
+        except (ConnectionError, InvalidURL):
+            logging.error(
+                ('Failed to download {4}\n'
+                 '    Project: {1}\n'
+                 '    Modification: {2}\n'
+                 '    Folder: {3}\n').format(*document_path.split(os.sep)))
+            return
+        # Write file to disk
+        with open(document_path, 'wb') as f:
+            shutil.copyfileobj(r.raw, f)
 
 
 def main():
@@ -162,9 +168,26 @@
         search_mod_ids = mod_ids_from_search(search_results_url)
         mod_ids.extend(search_mod_ids)
 
-    # Download documents from given modification ids
-    for mod_id in tqdm(mod_ids):
-        get_documents(mod_id, output_dir)
+    # Get list of documents from given modification IDs
+    mod_pbar = tqdm(mod_ids)
+    for mod_id in mod_pbar:
+        document_data = get_document_list(mod_id, output_dir)
+
+        # Download documents for current modification
+        doc_pbar = tqdm(document_data)
+
+        for doc in doc_pbar:
+            # Update progress bars
+            mod_name = doc['document_path'].split(os.sep)[-3]
+            doc_name = doc['document_path'].split(os.sep)[-1]
+            mod_pbar.set_description(mod_name)
+            doc_pbar.set_description(doc_name)
+
+            # Download document
+            download_document(doc['url'], doc['document_path'])
+
+        # Tidy up console after tqdm
+        print('\n')
 
 
 if __name__ == '__main__':
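
For reviewers trying the patch outside of main(), this is a minimal sketch of how the three helpers it introduces (mod_ids_from_search, get_document_list, download_document) chain together. The grab_all wrapper and the import path are assumptions for illustration, not part of the patch; only the three function signatures come from the diff above.

    from tqdm import tqdm

    # Assumption: the patched module is importable under this name.
    from major_projects_grabber import (
        mod_ids_from_search, get_document_list, download_document)


    def grab_all(search_results_url, output_dir):
        """Hypothetical wrapper: download every document behind a search results page."""
        # Each modification ID yields a list of document dicts with 'url' and 'document_path'
        for mod_id in tqdm(mod_ids_from_search(search_results_url)):
            for doc in get_document_list(mod_id, output_dir):
                download_document(doc['url'], doc['document_path'])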