|
|
@ -22,6 +22,18 @@ from tqdm import tqdm
|
|
|
|
from requests.exceptions import ConnectionError, InvalidURL
|
|
|
|
from requests.exceptions import ConnectionError, InvalidURL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_safe(s):
    """Remove characters that would be invalid in a filepath.

    The characters '\\', '*', '"', '<', '>' and '|' are deleted outright.
    A ':' is replaced with ' -' and a '/' is replaced with '-', so that
    titles such as "Project: Phase 1/2" become "Project - Phase 1-2".

    Args:
        s: The raw name (e.g. a project or modification name) as a string.

    Returns:
        A copy of ``s`` with the characters above removed or replaced.
    """
    # Remove '\', '*', '"', '<', '>' and '|'.
    # A character class is used so that each character is matched on its
    # own; the earlier alternation ended in '>\|', which only matched the
    # two-character sequence '>|' and left lone '>' and '|' in place.
    s_safe = re.sub(r'[\\*"<>|]', '', s)

    # Replace ':' with ' -' and '/' with '-'
    s_safe = s_safe.replace(':', ' -')
    s_safe = s_safe.replace('/', '-')

    # NOTE(review): '?' is left untouched here, matching the documented
    # character set above -- confirm whether it should be stripped as well.
    return s_safe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def mod_ids_from_search(search_results_url):
|
|
|
|
def mod_ids_from_search(search_results_url):
|
|
|
|
"""Get modification job IDs from search results URL"""
|
|
|
|
"""Get modification job IDs from search results URL"""
|
|
|
|
|
|
|
|
|
|
|
@ -58,6 +70,10 @@ def get_document_list(mod_id, output_dir):
|
|
|
|
# Get list of document folders
|
|
|
|
# Get list of document folders
|
|
|
|
folders = mod_tree.xpath('//div[@class="folder_row"]')
|
|
|
|
folders = mod_tree.xpath('//div[@class="folder_row"]')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Remove invalid characters before creating folders
|
|
|
|
|
|
|
|
project_name = make_safe(project_name)
|
|
|
|
|
|
|
|
mod_name = make_safe(mod_name)
|
|
|
|
|
|
|
|
|
|
|
|
# Create modification folder
|
|
|
|
# Create modification folder
|
|
|
|
mod_dir = os.path.join(output_dir, project_name, mod_name)
|
|
|
|
mod_dir = os.path.join(output_dir, project_name, mod_name)
|
|
|
|
os.makedirs(mod_dir, exist_ok=True)
|
|
|
|
os.makedirs(mod_dir, exist_ok=True)
|
|
|
@ -94,15 +110,14 @@ def get_document_list(mod_id, output_dir):
|
|
|
|
def download_document(url, document_path):
|
|
|
|
def download_document(url, document_path):
|
|
|
|
"""Download document from given url"""
|
|
|
|
"""Download document from given url"""
|
|
|
|
|
|
|
|
|
|
|
|
# Create output directories as required
|
|
|
|
# Check if destination path is too long (Windows filename limitation)
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
os.makedirs(os.path.dirname(document_path), exist_ok=True)
|
|
|
|
open(document_path, 'a').close()
|
|
|
|
except OSError:
|
|
|
|
except FileNotFoundError:
|
|
|
|
logging.error(('Failed to download {4}\n'
|
|
|
|
document_path = '\\\\?\\' + os.path.abspath(document_path)
|
|
|
|
' Project: {1}\n'
|
|
|
|
|
|
|
|
' Modification: {2}\n'
|
|
|
|
# Create output directories as required
|
|
|
|
' Folder: {3}\n').format(*document_path.split(os.sep)))
|
|
|
|
os.makedirs(os.path.dirname(document_path), exist_ok=True)
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Check if file exists
|
|
|
|
# Check if file exists
|
|
|
|
if os.path.isfile(document_path):
|
|
|
|
if os.path.isfile(document_path):
|
|
|
@ -119,12 +134,6 @@ def download_document(url, document_path):
|
|
|
|
' Folder: {3}\n').format(*document_path.split(os.sep)))
|
|
|
|
' Folder: {3}\n').format(*document_path.split(os.sep)))
|
|
|
|
return
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
# Check if destination path is too long (Windows filename limitation)
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
|
|
|
open(document_path, 'a').close()
|
|
|
|
|
|
|
|
except FileNotFoundError:
|
|
|
|
|
|
|
|
document_path = '\\\\?\\' + os.path.abspath(document_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Write file to disk
|
|
|
|
# Write file to disk
|
|
|
|
with open(document_path, 'wb') as f:
|
|
|
|
with open(document_path, 'wb') as f:
|
|
|
|
shutil.copyfileobj(r.raw, f)
|
|
|
|
shutil.copyfileobj(r.raw, f)
|
|
|
@ -191,6 +200,7 @@ def main():
|
|
|
|
doc_pbar.set_description(doc_name)
|
|
|
|
doc_pbar.set_description(doc_name)
|
|
|
|
|
|
|
|
|
|
|
|
# Download document
|
|
|
|
# Download document
|
|
|
|
|
|
|
|
print(doc)
|
|
|
|
download_document(doc['url'], doc['document_path'])
|
|
|
|
download_document(doc['url'], doc['document_path'])
|
|
|
|
|
|
|
|
|
|
|
|
# Tidy up console after tqdm
|
|
|
|
# Tidy up console after tqdm
|
|
|
|