|
|
|
@ -3,19 +3,13 @@ Download documents from the NSW DPE Major Projects website.
|
|
|
|
|
|
|
|
|
|
Example usage:
|
|
|
|
|
# Grab a single project modification using its job id, and save in 'files'
|
|
|
|
|
python major_projects_grabber.py -o files -i 1019
|
|
|
|
|
major_projects_grabber -o files -i 1746
|
|
|
|
|
|
|
|
|
|
# Grab all modifications listed on a search results page, and save in the current folder
|
|
|
|
|
python major_projects_grabber.py -o . http://
|
|
|
|
|
major_projects_grabber "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# search url
|
|
|
|
|
# http://majorprojects.planning.nsw.gov.au/index.pl?action=search&page_id=&search=&authority_id=&search_site_type_id=10&reference_table=&status_id=&decider=&from_date=&to_date=&x=31&y=16
|
|
|
|
|
|
|
|
|
|
# mod url
|
|
|
|
|
# http://majorprojects.planning.nsw.gov.au/index.pl?action=view_job&job_id=9503
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
@ -25,6 +19,7 @@ import requests
|
|
|
|
|
import argparse
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from lxml import html
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
from requests.exceptions import ConnectionError, InvalidURL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -126,10 +121,10 @@ def main():
|
|
|
|
|
example_text = """examples:
|
|
|
|
|
|
|
|
|
|
# Grab a single project modification using its job id, and save in 'files'
|
|
|
|
|
python major_projects_grabber.py -o files -i 1019
|
|
|
|
|
major_projects_grabber -i 1746 -o files
|
|
|
|
|
|
|
|
|
|
# Grab all modifications in search results page, and save in current folder
|
|
|
|
|
python major_projects_grabber.py -o . http://
|
|
|
|
|
major_projects_grabber "http://majorprojects.planning.nsw.gov.au/index.pl?action=search&authority_id=547"
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
# Set up command line arguments
|
|
|
|
@ -168,7 +163,7 @@ def main():
|
|
|
|
|
mod_ids.extend(search_mod_ids)
|
|
|
|
|
|
|
|
|
|
# Download documents from given modification ids
|
|
|
|
|
for mod_id in mod_ids:
|
|
|
|
|
for mod_id in tqdm(mod_ids):
|
|
|
|
|
get_documents(mod_id, output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|