Source code for discrete_optimization.salbp.parser

import os
from typing import Optional

from discrete_optimization.datasets import ERROR_MSG_MISSING_DATASETS, get_data_home
from discrete_optimization.salbp.problem import SalbpProblem


def get_data_available(
    data_folder: Optional[str] = None, data_home: Optional[str] = None
) -> list[str]:
    """Get datasets available for salbp.

    Params:
        data_folder: folder where datasets for the SALBP problem should be found.
            If None, we look in the "salbp" subdirectory of `data_home`.
        data_home: root directory for all datasets. If None, set by default to
            "~/discrete_optimization_data"
    """
    if data_folder is None:
        data_home = get_data_home(data_home=data_home)
        data_folder = f"{data_home}/salbp"
    try:
        subfolders = [
            f
            for f in os.listdir(data_folder)
            if os.path.isdir(os.path.join(data_folder, f))
        ]
        files = []
        for subfolder in subfolders:
            sf = os.path.join(data_folder, subfolder)
            files.extend(
                [os.path.join(sf, f) for f in os.listdir(sf) if "alb" in f]
            )
        return files
    except FileNotFoundError as e:
        raise FileNotFoundError(str(e) + ERROR_MSG_MISSING_DATASETS)
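A minimal usage sketch, assuming the SALBP datasets have already been downloaded into the default data home ("~/discrete_optimization_data"):

# List all available .alb instance files (paths depend on the local setup).
files = get_data_available()
for path in sorted(files):
    print(path)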
def remove_artifacts(text: str) -> str:
    """Strip "[source]"-style tags left over from scraped documentation pages."""
    cleaned = []
    i = 0
    n = len(text)
    while i < n:
        if text[i] == "[":
            # Look ahead for the closing "]".
            # If the bracketed snippet looks like a source tag, skip past it.
            j = text.find("]", i)
            if j != -1:
                snippet = text[i : j + 1]
                if "source" in snippet:
                    i = j + 1
                    continue
        cleaned.append(text[i])
        i += 1
    return "".join(cleaned)
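An illustrative call on a hypothetical scraped string, showing the kind of "[source]" tag this helper removes:

text = "<number of tasks> [source] 20"
print(remove_artifacts(text))  # -> "<number of tasks>  20"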
def parse_alb_file(file_path: str) -> SalbpProblem:
    """
    Parse a .alb file using string splitting and tokenization.
    No regular expressions are used.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        raw_content = f.read()

    # 1. Clean artifacts
    content = remove_artifacts(raw_content)

    data = {"number_of_tasks": 0, "cycle_time": 0, "task_times": {}, "precedence": []}

    # 2. Normalize newlines and split into tokens (words/numbers). Working on
    # tokens rather than lines handles the case where a task id is on one line
    # and its time is on the next.
    tokens = content.split()

    # State-machine parsing over tokens
    current_section = None
    iterator = iter(tokens)
    try:
        while True:
            token = next(iterator)

            # Detect tags
            if token.startswith("<"):
                # Reconstruct a full tag if it contains spaces (e.g., <number of tasks>)
                tag_acc = [token]
                while not token.endswith(">"):
                    token = next(iterator)
                    tag_acc.append(token)
                full_tag = " ".join(tag_acc).replace("<", "").replace(">", "")
                current_section = full_tag
                continue

            # Process data based on the current section
            if current_section == "number of tasks":
                data["number_of_tasks"] = int(token)
                current_section = None  # Done with this section
            elif current_section == "cycle time":
                data["cycle_time"] = int(token)
                current_section = None  # Done with this section
            elif current_section == "task times":
                # Expecting pairs: ID Duration
                t_id = int(token)
                t_time = int(next(iterator))
                data["task_times"][t_id] = t_time
            elif current_section == "precedence relations":
                # Expecting "pred,succ". With no spaces around the comma,
                # "1,2" arrives as a single token; with a space ("1, 2"),
                # the successor lands in the next token.
                if "," in token:
                    parts = token.split(",")
                    if parts[1] == "":
                        parts[1] = next(iterator)
                    p = int(parts[0])
                    s = int(parts[1])
                    data["precedence"].append((p, s))
    except StopIteration:
        pass

    return SalbpProblem(
        data["number_of_tasks"],
        data["cycle_time"],
        data["task_times"],
        data["precedence"],
    )
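A usage sketch combining the helpers above; it assumes at least one instance file is available locally, and only prints the resulting object since SalbpProblem's attributes are not shown in this module:

files = get_data_available()
problem = parse_alb_file(files[0])
print(problem)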