Add progress bars

7 years ago · e9f99f54de
parent f701e4212b
commit e9f99f54de
1 changed files with 81 additions and 58 deletions
--- a/major_projects_grabber/major_projects_grabber.py
+++ b/major_projects_grabber/major_projects_grabber.py
@@ -17,15 +17,31 @@ import shutil
 import logging
 import requests
 import argparse
 import pandas as pd
 from lxml import html
 from tqdm import tqdm
 from requests.exceptions import ConnectionError, InvalidURL
-def get_documents(mod_id, output_dir):
+def mod_ids_from_search(search_results_url):
-    """Download all documents from project modification ID
+    """Get modification job IDs from search results URL"""
-    """
+
    # Get HTML of search results page
    page = requests.get(search_results_url)
    # Create HTML tree
    tree = html.fromstring(page.content)
    # Find job ids of items in results list
    mod_ids = []
    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
    for mod in mods:
        mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())
    return mod_ids
 def get_document_list(mod_id, output_dir):
    """Get list of documents from project modification ID"""
    # Get html from mod page
    mod_url = ('http://majorprojects.planning.nsw.gov.au/'
@@ -58,63 +74,53 @@ def get_documents(mod_id, output_dir):
    with open(os.path.join(mod_dir, 'DPE-portal-page.html'), 'w') as f:
        f.write(text)
    document_data = []
    for folder in folders:
        folder_name = folder.xpath('a[2]')[0].text.strip()
        # Get documents in current folder
        documents = folder.xpath('ul/li/a')
        for document in documents:
-            document_url = document.get('href')
+            doc = {}
-            document_name = document.text
+            doc['url'] = document.get('href')
-            document_path = os.path.join(output_dir, project_name, mod_name,
+            doc['name'] = document.text
-                                         folder_name, document_name)
+            doc['document_path'] = os.path.join(
-
+                output_dir, project_name, mod_name, folder_name, doc['name'])
-            # Create output directories as required
+            document_data.append(doc)
-            try:
+
-                os.makedirs(os.path.dirname(document_path), exist_ok=True)
+    return document_data
-            except OSError:
+
-                logging.error(('Failed to download {}\n'
+
-                               '  Project: {}\n'
+def download_document(url, document_path):
-                               '  Modification: {}\n'
+    """Download document from given url"""
-                               '  Folder: {}\n').format(
+    
-                                   document_name, project_name, mod_name,
+    # Create output directories as required
-                                   folder_name))
+    try:
-                continue
+        os.makedirs(os.path.dirname(document_path), exist_ok=True)
-
+    except OSError:
-            # Download document, if it does not already exist
+        logging.error(('Failed to download {4}\n'
-            if os.path.isfile(document_path):
+                       '  Project: {1}\n'
-                pass
+                       '  Modification: {2}\n'
-            else:
+                       '  Folder: {3}\n').format(*document_path.split(os.sep)))
-                try:
+        return
-                    r = requests.get(document_url, stream=True)
+
-                except (ConnectionError, InvalidURL):
+    # Check if file exists
-                    logging.error(('Failed to download {}\n'
+    if os.path.isfile(document_path):
-                                   '  Project: {}\n'
+        pass
-                                   '  Modification: {}\n'
+    else:
-                                   '  Folder: {}\n').format(
+        try:
-                                       document_name, project_name, mod_name,
+            # Attempt to download file
-                                       folder_name))
+            r = requests.get(url, stream=True)
-                    continue
+        except (ConnectionError, InvalidURL):
-                with open(document_path, 'wb') as f:
+            logging.error(
-                    shutil.copyfileobj(r.raw, f)
+                ('Failed to download {4}\n'
-
+                 '  Project: {1}\n'
-
+                 '  Modification: {2}\n'
-def mod_ids_from_search(search_results_url):
+                 '  Folder: {3}\n').format(*document_path.split(os.sep)))
-    """Get modification job IDs from search results URL"""
+            return
-
+        # Write file to disk
-    # Get HTML of search results page
+        with open(document_path, 'wb') as f:
-    page = requests.get(search_results_url)
+            shutil.copyfileobj(r.raw, f)
    # Create HTML tree
    tree = html.fromstring(page.content)
    # Find job ids of items in results list
    mod_ids = []
    mods = tree.xpath('//*[@class="vpa_list"]/tr/td[1]/a')
    for mod in mods:
        mod_ids.append(re.search('(?<=job_id=)\d+', mod.get('href')).group())
    return mod_ids
 def main():
@@ -162,9 +168,26 @@ def main():
        search_mod_ids = mod_ids_from_search(search_results_url)
        mod_ids.extend(search_mod_ids)
-    # Download documents from given modification ids
+    # Get list of documents from given modification IDs
-    for mod_id in tqdm(mod_ids):
+    mod_pbar = tqdm(mod_ids)
-        get_documents(mod_id, output_dir)
+    for mod_id in mod_pbar:
        document_data = get_document_list(mod_id, output_dir)
        # Download documents for current modification
        doc_pbar = tqdm(document_data)
        for doc in doc_pbar:
            # Update progress bars
            mod_name = doc['document_path'].split(os.sep)[-3]
            doc_name = doc['document_path'].split(os.sep)[-1]
            mod_pbar.set_description(mod_name)
            doc_pbar.set_description(doc_name)
            # Download document
            download_document(doc['url'], doc['document_path'])
    # Tidy up console after tqdm
    print('\n')
 if __name__ == '__main__':