Source code for discrete_optimization.datasets

"""Fetch datasets for examples and tests."""


#  Copyright (c) 2022 AIRBUS and its affiliates.
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.

import glob
import gzip
import os
import shutil
import zipfile
from typing import Optional
from urllib.request import urlcleanup, urlretrieve

DO_DEFAULT_DATAHOME = "~/discrete_optimization_data"
DO_DEFAULT_DATAHOME_ENVVARNAME = "DISCRETE_OPTIMIZATION_DATA"

COURSERA_REPO_URL = "https://github.com/discreteoptimization/assignment"
COURSERA_REPO_URL_SHA1 = "f69378420ce2bb845abaef0f448eab303aa7a7e7"
COURSERA_DATASETS = ["coloring", "facility", "knapsack", "tsp", "vrp"]
COURSERA_DATADIRNAME = "data"

SOLUTIONSUPDATE_BASE_URL = (
    "http://solutionsupdate.ugent.be/sites/default/files/datasets/instances"
)
SOLUTIONSUPDATE_DATASETS = ["RG30", "RG300"]

PSPLIB_FILES_BASE_URL = "https://www.om-db.wi.tum.de/psplib/files"
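# archive name (without ".zip") -> filename prefix of the instance files to
# keep when extracting, cf fetch_data_from_psplib below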
PSPLIB_DATASETS = {
    "j10.mm": "j1010_",
    "j120.sm": "j1201_",
    "j30.sm": "j301_",
    "j60.sm": "j601_",
}


IMOPSE_REPO_URL = "https://github.com/imopse/iMOPSE"
IMOPSE_REPO_URL_SHA1 = "e58ace53202ec29aa548dd0678ae3164d8349f4e"
IMOPSE_DATASET_RELATIVE_PATH = "configurations/problems/MSRCPSP/Regular"

MSPSPLIB_REPO_URL = "https://github.com/youngkd/MSPSP-InstLib"
MSPSPLIB_REPO_URL_SHA1 = "f77644175b84beed3bd365315412abee1a15eea1"

JSPLIB_REPO_URL = "https://github.com/tamy0612/JSPLIB"
JSPLIB_REPO_URL_SHA1 = "eea2b60dd7e2f5c907ff7302662c61812eb7efdf"

MSLIB_DATASET_URL = "http://www.projectmanagement.ugent.be/sites/default/files/datasets/MSRCPSP/MSLIB.zip"
MSLIB_DATASET_RELATIVE_PATH = "MSLIB.zip"

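# gzipped graph instances used by the maximum independent set (mis) examples,
# hosted as attachments of OEIS sequence A265032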
MIS_FILES = [
    "https://oeis.org/A265032/a265032_1dc.64.txt.gz",
    "https://oeis.org/A265032/a265032_1dc.128.txt.gz",
    "https://oeis.org/A265032/a265032_1dc.256.txt.gz",
    "https://oeis.org/A265032/a265032_1dc.512.txt.gz",
    "https://oeis.org/A265032/a265032_1dc.1024.txt.gz",
    "https://oeis.org/A265032/a265032_1dc.2048.txt.gz",
    "https://oeis.org/A265032/a265032_2dc.128.txt.gz",
    "https://oeis.org/A265032/a265032_2dc.256.txt.gz",
    "https://oeis.org/A265032/a265032_2dc.512.txt.gz",
    "https://oeis.org/A265032/a265032_2dc.1024.txt.gz",
    "https://oeis.org/A265032/a265032_2dc.2048.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.8.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.16.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.32.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.64.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.128.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.256.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.512.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.1024.txt.gz",
    "https://oeis.org/A265032/a265032_1tc.2048.txt.gz",
    "https://oeis.org/A265032/a265032_1et.64.txt.gz",
    "https://oeis.org/A265032/a265032_1et.128.txt.gz",
    "https://oeis.org/A265032/a265032_1et.256.txt.gz",
    "https://oeis.org/A265032/a265032_1et.512.txt.gz",
    "https://oeis.org/A265032/a265032_1et.1024.txt.gz",
    "https://oeis.org/A265032/a265032_1et.2048.txt.gz",
    "https://oeis.org/A265032/a265032_1zc.128.txt.gz",
    "https://oeis.org/A265032/a265032_1zc.256.txt.gz",
    "https://oeis.org/A265032/a265032_1zc.512.txt.gz",
    "https://oeis.org/A265032/a265032_1zc.1024.txt.gz",
    "https://oeis.org/A265032/a265032_1zc.2048.txt.gz",
    "https://oeis.org/A265032/a265032_1zc.4096.txt.gz",
]


def get_data_home(data_home: Optional[str] = None) -> str:
    """Return the path of the discrete-optimization data directory.

    This folder is used by some large dataset loaders to avoid downloading the
    data several times.

    By default the data directory is set to a folder named
    'discrete_optimization_data' in the user home folder. Alternatively, it can
    be set by the 'DISCRETE_OPTIMIZATION_DATA' environment variable or
    programmatically by giving an explicit folder path. The '~' symbol is
    expanded to the user home folder. If the folder does not already exist, it
    is automatically created.

    Params:
        data_home: The path to the discrete-optimization data directory.
            If `None`, the default path is `~/discrete_optimization_data`.

    """
    if data_home is None:
        data_home = os.environ.get(DO_DEFAULT_DATAHOME_ENVVARNAME, DO_DEFAULT_DATAHOME)
    data_home = os.path.expanduser(data_home)
    os.makedirs(data_home, exist_ok=True)
    return data_home
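
# A minimal usage sketch of get_data_home (paths shown are illustrative):
#
#     get_data_home()             # -> "/home/<user>/discrete_optimization_data"
#     get_data_home("~/my_data")  # explicit folder: '~' expanded, created if missing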


def fetch_data_from_coursera(data_home: Optional[str] = None):
    """Fetch data from the coursera repo.

    https://github.com/discreteoptimization/assignment

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)

    # download the repo archive to a temporary file
    url = f"{COURSERA_REPO_URL}/archive/{COURSERA_REPO_URL_SHA1}.zip"
    try:
        local_file_path, headers = urlretrieve(url)
        # extract only the data files
        with zipfile.ZipFile(local_file_path) as zipf:
            namelist = zipf.namelist()
            rootdir = namelist[0].split("/")[0]
            for dataset in COURSERA_DATASETS:
                dataset_dir = f"{data_home}/{dataset}"
                os.makedirs(dataset_dir, exist_ok=True)
                dataset_prefix_in_zip = f"{rootdir}/{dataset}/{COURSERA_DATADIRNAME}/"
                for name in namelist:
                    if name.startswith(dataset_prefix_in_zip):
                        zipf.extract(name, path=dataset_dir)
                # flatten: move the extracted files to the dataset directory root
                for datafile in glob.glob(f"{dataset_dir}/{dataset_prefix_in_zip}/*"):
                    os.replace(
                        src=datafile, dst=f"{dataset_dir}/{os.path.basename(datafile)}"
                    )
                os.removedirs(f"{dataset_dir}/{dataset_prefix_in_zip}")
    finally:
        # remove temporary download files
        urlcleanup()
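
# After fetch_data_from_coursera(), each dataset is flattened into its own
# subfolder of the data directory, e.g. (with the default data home):
#
#     ~/discrete_optimization_data/coloring/
#     ~/discrete_optimization_data/knapsack/
#     ~/discrete_optimization_data/tsp/
#     ...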


def fetch_data_from_solutionsupdate(data_home: Optional[str] = None):
    """Fetch data for rcpsp examples from solutionsupdate.

    cf http://solutionsupdate.ugent.be/index.php/solutions-update

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)
    # get the rcpsp data directory
    rcpsp_dir = f"{data_home}/rcpsp"
    os.makedirs(rcpsp_dir, exist_ok=True)
    try:
        # download each dataset
        for dataset in SOLUTIONSUPDATE_DATASETS:
            url = f"{SOLUTIONSUPDATE_BASE_URL}/{dataset}.zip"
            local_file_path, _ = urlretrieve(url)
            with zipfile.ZipFile(local_file_path) as zipf:
                namelist = zipf.namelist()
                for name in namelist:
                    zipf.extract(name, path=rcpsp_dir)
    finally:
        # remove temporary download files
        urlcleanup()


def fetch_data_from_psplib(data_home: Optional[str] = None):
    """Fetch data for rcpsp examples from psplib.

    cf https://www.om-db.wi.tum.de/psplib/data.html

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)
    # get the rcpsp data directory
    rcpsp_dir = f"{data_home}/rcpsp"
    os.makedirs(rcpsp_dir, exist_ok=True)
    try:
        # download each dataset and keep only the instance files matching its prefix
        for dataset, prefix in PSPLIB_DATASETS.items():
            url = f"{PSPLIB_FILES_BASE_URL}/{dataset}.zip"
            local_file_path, _ = urlretrieve(url)
            with zipfile.ZipFile(local_file_path) as zipf:
                namelist = zipf.namelist()
                for name in namelist:
                    if name.startswith(prefix):
                        zipf.extract(name, path=rcpsp_dir)
    finally:
        # remove temporary download files
        urlcleanup()
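
# For instance, for the "j30.sm" entry of PSPLIB_DATASETS above, only members
# of j30.sm.zip whose names start with "j301_" are kept (this assumes PSPLIB's
# usual instance naming inside the archives).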


def fetch_data_from_imopse(data_home: Optional[str] = None):
    """Fetch data from the iMOPSE repo for rcpsp_multiskill examples.

    https://github.com/imopse/iMOPSE

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)
    # get the rcpsp_multiskill data directory
    dataset_dir = f"{data_home}/rcpsp_multiskill"
    os.makedirs(dataset_dir, exist_ok=True)

    # download the repo archive to a temporary file
    url = f"{IMOPSE_REPO_URL}/archive/{IMOPSE_REPO_URL_SHA1}.zip"
    try:
        local_file_path, headers = urlretrieve(url)
        # extract only the data files
        with zipfile.ZipFile(local_file_path) as zipf:
            namelist = zipf.namelist()
            rootdir = namelist[0].split("/")[0]
            dataset_prefix_in_zip = f"{rootdir}/{IMOPSE_DATASET_RELATIVE_PATH}/"
            for name in namelist:
                if name.startswith(dataset_prefix_in_zip):
                    zipf.extract(name, path=dataset_dir)
            # flatten: move the extracted files to the dataset directory root
            for datafile in glob.glob(f"{dataset_dir}/{dataset_prefix_in_zip}/*"):
                os.replace(
                    src=datafile, dst=f"{dataset_dir}/{os.path.basename(datafile)}"
                )
            os.removedirs(f"{dataset_dir}/{dataset_prefix_in_zip}")
    finally:
        # remove temporary download files
        urlcleanup()


def fetch_data_from_mspsplib_repo(data_home: Optional[str] = None):
    """Fetch data from the youngkd repo (for multiskill rcpsp).

    https://github.com/youngkd/MSPSP-InstLib

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)

    # download the repo archive to a temporary file
    url = f"{MSPSPLIB_REPO_URL}/archive/{MSPSPLIB_REPO_URL_SHA1}.zip"
    try:
        local_file_path, headers = urlretrieve(url)
        # extract only the data files
        with zipfile.ZipFile(local_file_path) as zipf:
            namelist = zipf.namelist()
            rootdir = namelist[0].split("/")[0]
            dataset_dir = f"{data_home}/MSPSP_Instances"
            os.makedirs(dataset_dir, exist_ok=True)
            dataset_prefix_in_zip = f"{rootdir}/instances/"
            for name in namelist:
                if name.startswith(dataset_prefix_in_zip):
                    zipf.extract(name, path=dataset_dir)
            # flatten: move the extracted files to the dataset directory root
            for datafile in glob.glob(f"{dataset_dir}/{dataset_prefix_in_zip}/*"):
                os.replace(
                    src=datafile, dst=f"{dataset_dir}/{os.path.basename(datafile)}"
                )
            os.removedirs(f"{dataset_dir}/{dataset_prefix_in_zip}")
    finally:
        # remove temporary download files
        urlcleanup()


def fetch_data_from_mslib(data_home: Optional[str] = None):
    """Fetch data from MSLIB for rcpsp_multiskill examples.

    cf https://www.projectmanagement.ugent.be/research/project_scheduling/MSRCPSP

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)
    # get the rcpsp_multiskill data directory
    rcpsp_multiskill_dir = f"{data_home}/rcpsp_multiskill_mslib"
    os.makedirs(rcpsp_multiskill_dir, exist_ok=True)
    try:
        # download the dataset
        local_file_path, headers = urlretrieve(MSLIB_DATASET_URL)
        # extract the data
        with zipfile.ZipFile(local_file_path) as zipf:
            zipf.extractall(path=rcpsp_multiskill_dir)
    finally:
        # remove temporary download files
        urlcleanup()


def decompress_gz_to_folder(input_file, output_folder, url):
    """Decompress a gzipped file downloaded from `url` into `output_folder`.

    The output file name is derived from the last component of the url,
    dropping the 'a265032_' prefix and the '.gz' extension.

    """
    with gzip.open(input_file, "rb") as f_in:
        # base name of the gzipped file, without the 'a265032_' prefix
        base_name = os.path.basename(url)[len("a265032_"):]
        # drop the '.gz' extension
        file_name = os.path.splitext(base_name)[0]
        # create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)
        # construct the output file path
        output_file = os.path.join(output_folder, file_name)
        # write the decompressed data to the output file
        with open(output_file, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
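
# A minimal usage sketch of decompress_gz_to_folder ("/tmp/mis" is an
# arbitrary example folder):
#
#     url = "https://oeis.org/A265032/a265032_1dc.64.txt.gz"
#     local_file, _ = urlretrieve(url)
#     decompress_gz_to_folder(local_file, "/tmp/mis", url)
#     # -> writes "/tmp/mis/1dc.64.txt"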


def fetch_data_for_mis(data_home: Optional[str] = None):
    """Fetch graph instances for mis (maximum independent set) examples.

    cf https://oeis.org/A265032

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)
    # get the mis data directory
    mis_dir = f"{data_home}/mis"
    os.makedirs(mis_dir, exist_ok=True)
    try:
        # download and decompress each dataset
        for url in MIS_FILES:
            filename, _ = urlretrieve(url)
            decompress_gz_to_folder(filename, mis_dir, url)
    finally:
        # remove temporary download files
        urlcleanup()


def fetch_data_from_jsplib_repo(data_home: Optional[str] = None):
    """Fetch data from the jsplib repo (for jobshop problems).

    https://github.com/tamy0612/JSPLIB

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)

    # download the repo archive to a temporary file
    url = f"{JSPLIB_REPO_URL}/archive/{JSPLIB_REPO_URL_SHA1}.zip"
    try:
        local_file_path, headers = urlretrieve(url)
        # extract only the data files
        with zipfile.ZipFile(local_file_path) as zipf:
            namelist = zipf.namelist()
            rootdir = namelist[0].split("/")[0]
            dataset_dir = f"{data_home}/jobshop"
            os.makedirs(dataset_dir, exist_ok=True)
            dataset_prefix_in_zip = f"{rootdir}/instances/"
            for name in namelist:
                if name.startswith(dataset_prefix_in_zip):
                    zipf.extract(name, path=dataset_dir)
            # flatten: move the extracted files to the dataset directory root
            for datafile in glob.glob(f"{dataset_dir}/{dataset_prefix_in_zip}/*"):
                os.replace(
                    src=datafile, dst=f"{dataset_dir}/{os.path.basename(datafile)}"
                )
            os.removedirs(f"{dataset_dir}/{dataset_prefix_in_zip}")
    finally:
        # remove temporary download files
        urlcleanup()


def fetch_data_fjsp(data_home: Optional[str] = None):
    """Fetch data for flexible jobshop examples from the OpenHSU repository.

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    # get the proper data directory
    data_home = get_data_home(data_home=data_home)
    url = "https://openhsu.ub.hsu-hh.de/bitstreams/4ed8d5b1-2546-4a30-8f3a-a8f3732ffbbd/download"
    try:
        local_file_path, headers = urlretrieve(url)
        # extract the data
        dataset_dir = f"{data_home}/jfsp_openhsu"
        os.makedirs(dataset_dir, exist_ok=True)
        with zipfile.ZipFile(local_file_path) as zipf:
            zipf.extractall(path=dataset_dir)
    finally:
        # remove temporary download files
        urlcleanup()


def fetch_all_datasets(data_home: Optional[str] = None):
    """Fetch data used by examples for all packages.

    Params:
        data_home: Specify the cache folder for the datasets. By default
            all discrete-optimization data is stored in
            '~/discrete_optimization_data' subfolders.

    """
    fetch_data_from_coursera(data_home=data_home)
    fetch_data_from_psplib(data_home=data_home)
    fetch_data_from_imopse(data_home=data_home)
    fetch_data_from_solutionsupdate(data_home=data_home)
    fetch_data_for_mis(data_home=data_home)
    fetch_data_from_jsplib_repo(data_home=data_home)
    fetch_data_fjsp(data_home=data_home)
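
# Typical entry points (the custom cache folder is illustrative):
#
#     fetch_all_datasets()                     # default cache, ~/discrete_optimization_data
#     fetch_all_datasets(data_home="/tmp/do")  # custom cache folder
#     fetch_data_from_psplib()                 # a single dataset family
#
# Note that fetch_data_from_mspsplib_repo and fetch_data_from_mslib are not
# called by fetch_all_datasets and must be invoked explicitly if needed.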


if __name__ == "__main__":
    fetch_all_datasets()