diff --git a/openml/__init__.py b/openml/__init__.py index ae5db261f..21dda24ad 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -33,6 +33,7 @@ utils, ) from .__version__ import __version__ +from ._api import _backend from .datasets import OpenMLDataFeature, OpenMLDataset from .evaluations import OpenMLEvaluation from .flows import OpenMLFlow @@ -109,6 +110,7 @@ def populate_cache( "OpenMLTask", "__version__", "_api_calls", + "_backend", "config", "datasets", "evaluations", diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py new file mode 100644 index 000000000..926fee3d4 --- /dev/null +++ b/openml/_api/__init__.py @@ -0,0 +1,95 @@ +from .clients import ( + HTTPCache, + HTTPClient, + MinIOClient, +) +from .resources import ( + API_REGISTRY, + DatasetAPI, + DatasetV1API, + DatasetV2API, + EstimationProcedureAPI, + EstimationProcedureV1API, + EstimationProcedureV2API, + EvaluationAPI, + EvaluationMeasureAPI, + EvaluationMeasureV1API, + EvaluationMeasureV2API, + EvaluationV1API, + EvaluationV2API, + FallbackProxy, + FlowAPI, + FlowV1API, + FlowV2API, + ResourceAPI, + ResourceV1API, + ResourceV2API, + RunAPI, + RunV1API, + RunV2API, + SetupAPI, + SetupV1API, + SetupV2API, + StudyAPI, + StudyV1API, + StudyV2API, + TaskAPI, + TaskV1API, + TaskV2API, +) +from .setup import ( + APIBackend, + APIBackendBuilder, + APIConfig, + CacheConfig, + Config, + ConnectionConfig, + _backend, +) + +__all__ = [ + "API_REGISTRY", + "APIBackend", + "APIBackendBuilder", + "APIConfig", + "CacheConfig", + "Config", + "ConnectionConfig", + "DatasetAPI", + "DatasetV1API", + "DatasetV2API", + "EstimationProcedureAPI", + "EstimationProcedureV1API", + "EstimationProcedureV2API", + "EvaluationAPI", + "EvaluationMeasureAPI", + "EvaluationMeasureV1API", + "EvaluationMeasureV2API", + "EvaluationV1API", + "EvaluationV2API", + "FallbackProxy", + "FallbackProxy", + "FlowAPI", + "FlowV1API", + "FlowV2API", + "HTTPCache", + "HTTPClient", + "MinIOClient", + "ResourceAPI", + "ResourceAPI", + "ResourceV1API", + "ResourceV2API", + "RunAPI", + "RunV1API", + "RunV2API", + "SetupAPI", + "SetupV1API", + "SetupV2API", + "StudyAPI", + "StudyV1API", + "StudyV2API", + "TaskAPI", + "TaskV1API", + "TaskV2API", + "_backend", +] diff --git a/openml/_api/clients/__init__.py b/openml/_api/clients/__init__.py new file mode 100644 index 000000000..42f11fbcf --- /dev/null +++ b/openml/_api/clients/__init__.py @@ -0,0 +1,8 @@ +from .http import HTTPCache, HTTPClient +from .minio import MinIOClient + +__all__ = [ + "HTTPCache", + "HTTPClient", + "MinIOClient", +] diff --git a/openml/_api/clients/http.py b/openml/_api/clients/http.py new file mode 100644 index 000000000..51056bf3a --- /dev/null +++ b/openml/_api/clients/http.py @@ -0,0 +1,480 @@ +from __future__ import annotations + +import hashlib +import json +import logging +import math +import random +import time +import xml +from collections.abc import Callable, Mapping +from pathlib import Path +from typing import Any +from urllib.parse import urlencode, urljoin, urlparse + +import requests +import xmltodict +from requests import Response + +from openml.__version__ import __version__ +from openml.enums import RetryPolicy +from openml.exceptions import ( + OpenMLCacheRequiredError, + OpenMLHashException, + OpenMLNotAuthorizedError, + OpenMLServerError, + OpenMLServerException, + OpenMLServerNoResult, +) + + +class HTTPCache: + def __init__(self, *, path: Path, ttl: int) -> None: + self.path = path + self.ttl = ttl + + def get_key(self, url: str, params: dict[str, Any]) -> str: 
+ parsed_url = urlparse(url) + netloc_parts = parsed_url.netloc.split(".")[::-1] + path_parts = parsed_url.path.strip("/").split("/") + + filtered_params = {k: v for k, v in params.items() if k != "api_key"} + params_part = [urlencode(filtered_params)] if filtered_params else [] + + return str(Path(*netloc_parts, *path_parts, *params_part)) + + def _key_to_path(self, key: str) -> Path: + return self.path.joinpath(key) + + def load(self, key: str) -> Response: + path = self._key_to_path(key) + + if not path.exists(): + raise FileNotFoundError(f"Cache directory not found: {path}") + + meta_path = path / "meta.json" + headers_path = path / "headers.json" + body_path = path / "body.bin" + + if not (meta_path.exists() and headers_path.exists() and body_path.exists()): + raise FileNotFoundError(f"Incomplete cache at {path}") + + with meta_path.open("r", encoding="utf-8") as f: + meta = json.load(f) + + created_at = meta.get("created_at") + if created_at is None: + raise ValueError("Cache metadata missing 'created_at'") + + if time.time() - created_at > self.ttl: + raise TimeoutError(f"Cache expired for {path}") + + with headers_path.open("r", encoding="utf-8") as f: + headers = json.load(f) + + body = body_path.read_bytes() + + response = Response() + response.status_code = meta["status_code"] + response.url = meta["url"] + response.reason = meta["reason"] + response.headers = headers + response._content = body + response.encoding = meta["encoding"] + + return response + + def save(self, key: str, response: Response) -> None: + path = self._key_to_path(key) + path.mkdir(parents=True, exist_ok=True) + + (path / "body.bin").write_bytes(response.content) + + with (path / "headers.json").open("w", encoding="utf-8") as f: + json.dump(dict(response.headers), f) + + meta = { + "status_code": response.status_code, + "url": response.url, + "reason": response.reason, + "encoding": response.encoding, + "elapsed": response.elapsed.total_seconds(), + "created_at": time.time(), + "request": { + "method": response.request.method if response.request else None, + "url": response.request.url if response.request else None, + "headers": dict(response.request.headers) if response.request else None, + "body": response.request.body if response.request else None, + }, + } + + with (path / "meta.json").open("w", encoding="utf-8") as f: + json.dump(meta, f) + + +class HTTPClient: + def __init__( # noqa: PLR0913 + self, + *, + server: str, + base_url: str, + api_key: str, + timeout: int, + retries: int, + retry_policy: RetryPolicy, + cache: HTTPCache | None = None, + ) -> None: + self.server = server + self.base_url = base_url + self.api_key = api_key + self.timeout = timeout + self.retries = retries + self.retry_policy = retry_policy + self.cache = cache + + self.retry_func = ( + self._human_delay if retry_policy == RetryPolicy.HUMAN else self._robot_delay + ) + self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} + + def _robot_delay(self, n: int) -> float: + wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 + variation = random.gauss(0, wait / 10) + return max(1.0, wait + variation) + + def _human_delay(self, n: int) -> float: + return max(1.0, n) + + def _parse_exception_response( + self, + response: Response, + ) -> tuple[int | None, str]: + content_type = response.headers.get("Content-Type", "").lower() + + if "json" in content_type: + server_exception = response.json() + server_error = server_exception["detail"] + code = server_error.get("code") + message = server_error.get("message") + 
additional_information = server_error.get("additional_information") + else: + server_exception = xmltodict.parse(response.text) + server_error = server_exception["oml:error"] + code = server_error.get("oml:code") + message = server_error.get("oml:message") + additional_information = server_error.get("oml:additional_information") + + if code is not None: + code = int(code) + + if message and additional_information: + full_message = f"{message} - {additional_information}" + elif message: + full_message = message + elif additional_information: + full_message = additional_information + else: + full_message = "" + + return code, full_message + + def _raise_code_specific_error( + self, + code: int, + message: str, + url: str, + files: Mapping[str, Any] | None, + ) -> None: + if code in [111, 372, 512, 500, 482, 542, 674]: + # 512 for runs, 372 for datasets, 500 for flows + # 482 for tasks, 542 for evaluations, 674 for setups + # 111 for dataset descriptions + raise OpenMLServerNoResult(code=code, message=message, url=url) + + # 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow) + if code in [163] and files is not None and "description" in files: + # file_elements['description'] is the XML file description of the flow + message = f"\n{files['description']}\n{message}" + + if code in [ + 102, # flow/exists post + 137, # dataset post + 350, # dataset/42 delete + 310, # flow/ post + 320, # flow/42 delete + 400, # run/42 delete + 460, # task/42 delete + ]: + raise OpenMLNotAuthorizedError( + message=( + f"The API call {url} requires authentication via an API key.\nPlease configure " + "OpenML-Python to use your API as described in this example:" + "\nhttps://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication" + ) + ) + + # Propagate all server errors to the calling functions, except + # for 107 which represents a database connection error. + # These are typically caused by high server load, + # which means trying again might resolve the issue. + # DATABASE_CONNECTION_ERRCODE + if code != 107: + raise OpenMLServerException(code=code, message=message, url=url) + + def _validate_response( + self, + method: str, + url: str, + files: Mapping[str, Any] | None, + response: Response, + ) -> Exception | None: + if ( + "Content-Encoding" not in response.headers + or response.headers["Content-Encoding"] != "gzip" + ): + logging.warning(f"Received uncompressed content from OpenML for {url}.") + + if response.status_code == 200: + return None + + if response.status_code == requests.codes.URI_TOO_LONG: + raise OpenMLServerError(f"URI too long! ({url})") + + retry_raise_e: Exception | None = None + + try: + code, message = self._parse_exception_response(response) + + except (requests.exceptions.JSONDecodeError, xml.parsers.expat.ExpatError) as e: + if method != "GET": + extra = f"Status code: {response.status_code}\n{response.text}" + raise OpenMLServerError( + f"Unexpected server error when calling {url}. Please contact the " + f"developers!\n{extra}" + ) from e + + retry_raise_e = e + + except Exception as e: + # If we failed to parse it out, + # then something has gone wrong in the body we have sent back + # from the server and there is little extra information we can capture. + raise OpenMLServerError( + f"Unexpected server error when calling {url}. 
Please contact the developers!\n" + f"Status code: {response.status_code}\n{response.text}", + ) from e + + if code is not None: + self._raise_code_specific_error( + code=code, + message=message, + url=url, + files=files, + ) + + if retry_raise_e is None: + retry_raise_e = OpenMLServerException(code=code, message=message, url=url) + + return retry_raise_e + + def _request( # noqa: PLR0913 + self, + method: str, + url: str, + params: Mapping[str, Any], + data: Mapping[str, Any], + headers: Mapping[str, str], + timeout: float | int, + files: Mapping[str, Any] | None, + **request_kwargs: Any, + ) -> tuple[Response | None, Exception | None]: + retry_raise_e: Exception | None = None + response: Response | None = None + + try: + response = requests.request( + method=method, + url=url, + params=params, + data=data, + headers=headers, + timeout=timeout, + files=files, + **request_kwargs, + ) + except ( + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ConnectionError, + requests.exceptions.SSLError, + ) as e: + retry_raise_e = e + + if response is not None: + retry_raise_e = self._validate_response( + method=method, + url=url, + files=files, + response=response, + ) + + return response, retry_raise_e + + def request( # noqa: PLR0913, C901 + self, + method: str, + path: str, + *, + use_cache: bool = False, + reset_cache: bool = False, + use_api_key: bool = False, + md5_checksum: str | None = None, + **request_kwargs: Any, + ) -> Response: + url = urljoin(self.server, urljoin(self.base_url, path)) + retries = max(1, self.retries) + + params = request_kwargs.pop("params", {}).copy() + data = request_kwargs.pop("data", {}).copy() + + if use_api_key: + params["api_key"] = self.api_key + + if method.upper() in {"POST", "PUT", "PATCH"}: + data = {**params, **data} + params = {} + + # prepare headers + headers = request_kwargs.pop("headers", {}).copy() + headers.update(self.headers) + + timeout = request_kwargs.pop("timeout", self.timeout) + files = request_kwargs.pop("files", None) + + if use_cache and not reset_cache and self.cache is not None: + cache_key = self.cache.get_key(url, params) + try: + return self.cache.load(cache_key) + except (FileNotFoundError, TimeoutError): + pass # cache miss or expired, continue + except Exception: + raise # propagate unexpected cache errors + + for retry_counter in range(1, retries + 1): + response, retry_raise_e = self._request( + method=method, + url=url, + params=params, + data=data, + headers=headers, + timeout=timeout, + files=files, + **request_kwargs, + ) + + # executed successfully + if retry_raise_e is None: + break + # tries completed + if retry_counter >= retries: + raise retry_raise_e + + delay = self.retry_func(retry_counter) + time.sleep(delay) + + assert response is not None + + if use_cache and self.cache is not None: + cache_key = self.cache.get_key(url, params) + self.cache.save(cache_key, response) + + if md5_checksum is not None: + self._verify_checksum(response, md5_checksum) + + return response + + def _verify_checksum(self, response: Response, md5_checksum: str) -> None: + # ruff sees hashlib.md5 as insecure + actual = hashlib.md5(response.content).hexdigest() # noqa: S324 + if actual != md5_checksum: + raise OpenMLHashException( + f"Checksum of downloaded file is unequal to the expected checksum {md5_checksum} " + f"when downloading {response.url}.", + ) + + def get( + self, + path: str, + *, + use_cache: bool = False, + reset_cache: bool = False, + use_api_key: bool = False, + md5_checksum: str | None = None, + 
**request_kwargs: Any, + ) -> Response: + return self.request( + method="GET", + path=path, + use_cache=use_cache, + reset_cache=reset_cache, + use_api_key=use_api_key, + md5_checksum=md5_checksum, + **request_kwargs, + ) + + def post( + self, + path: str, + *, + use_api_key: bool = True, + **request_kwargs: Any, + ) -> Response: + return self.request( + method="POST", + path=path, + use_cache=False, + use_api_key=use_api_key, + **request_kwargs, + ) + + def delete( + self, + path: str, + **request_kwargs: Any, + ) -> Response: + return self.request( + method="DELETE", + path=path, + use_cache=False, + use_api_key=True, + **request_kwargs, + ) + + def download( + self, + url: str, + handler: Callable[[Response, Path, str], Path] | None = None, + encoding: str = "utf-8", + file_name: str = "response.txt", + md5_checksum: str | None = None, + ) -> Path: + if self.cache is None: + raise OpenMLCacheRequiredError( + "A cache object is required for download, but none was provided in the HTTPClient." + ) + base = self.cache.path + file_path = base / "downloads" / urlparse(url).path.lstrip("/") / file_name + file_path = file_path.expanduser() + file_path.parent.mkdir(parents=True, exist_ok=True) + if file_path.exists(): + return file_path + + response = self.get(url, md5_checksum=md5_checksum) + if handler is not None: + return handler(response, file_path, encoding) + + return self._text_handler(response, file_path, encoding) + + def _text_handler(self, response: Response, path: Path, encoding: str) -> Path: + with path.open("w", encoding=encoding) as f: + f.write(response.text) + return path diff --git a/openml/_api/clients/minio.py b/openml/_api/clients/minio.py new file mode 100644 index 000000000..a9b62a7ff --- /dev/null +++ b/openml/_api/clients/minio.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +import contextlib +import shutil +import urllib +import zipfile +from pathlib import Path + +import minio +import requests +from urllib3 import ProxyManager + +import openml +from openml.__version__ import __version__ +from openml.utils import ProgressBar + + +class MinIOClient: + def __init__(self, path: Path) -> None: + self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} + self.path = path + + def _get_path(self, url: str) -> Path: + parsed_url = urllib.parse.urlparse(url) + return self.path / "minio" / parsed_url.path.lstrip("/") + + def download_minio_file( + self, + source: str, + destination: str | Path | None = None, + exists_ok: bool = True, # noqa: FBT002 + proxy: str | None = "auto", + ) -> Path: + """Download file ``source`` from a MinIO Bucket and store it at ``destination``. + + Parameters + ---------- + source : str + URL to a file in a MinIO bucket. + destination : str | Path + Path to store the file to, if a directory is provided the original filename is used. + exists_ok : bool, optional (default=True) + If False, raise FileExists if a file already exists in ``destination``. + proxy: str, optional (default = "auto") + The proxy server to use. By default it's "auto" which uses ``requests`` to + automatically find the proxy to use. Pass None or the environment variable + ``no_proxy="*"`` to disable proxies. 
+ """ + destination = self._get_path(source) if destination is None else Path(destination) + parsed_url = urllib.parse.urlparse(source) + + # expect path format: /BUCKET/path/to/file.ext + bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1) + if destination.is_dir(): + destination = Path(destination, object_name) + if destination.is_file() and not exists_ok: + raise FileExistsError(f"File already exists in {destination}.") + + destination = destination.expanduser() + destination.parent.mkdir(parents=True, exist_ok=True) + + if proxy == "auto": + resolved_proxies = requests.utils.get_environ_proxies(parsed_url.geturl()) + proxy = requests.utils.select_proxy(parsed_url.geturl(), resolved_proxies) # type: ignore + + proxy_client = ProxyManager(proxy) if proxy else None + + client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client) + try: + client.fget_object( + bucket_name=bucket, + object_name=object_name, + file_path=str(destination), + progress=ProgressBar() if openml.config.show_progress else None, + request_headers=self.headers, + ) + if destination.is_file() and destination.suffix == ".zip": + with zipfile.ZipFile(destination, "r") as zip_ref: + zip_ref.extractall(destination.parent) + + except minio.error.S3Error as e: + if e.message is not None and e.message.startswith("Object does not exist"): + raise FileNotFoundError(f"Object at '{source}' does not exist.") from e + # e.g. permission error, or a bucket does not exist (which is also interpreted as a + # permission error on minio level). + raise FileNotFoundError("Bucket does not exist or is private.") from e + + return destination + + def download_minio_bucket(self, source: str, destination: str | Path | None = None) -> None: + """Download file ``source`` from a MinIO Bucket and store it at ``destination``. + + Does not redownload files which already exist. + + Parameters + ---------- + source : str + URL to a MinIO bucket. + destination : str | Path + Path to a directory to store the bucket content in. 
+ """ + destination = self._get_path(source) if destination is None else Path(destination) + parsed_url = urllib.parse.urlparse(source) + if destination.suffix: + destination = destination.parent + # expect path format: /BUCKET/path/to/file.ext + _, bucket, *prefixes, _ = parsed_url.path.split("/") + prefix = "/".join(prefixes) + + client = minio.Minio(endpoint=parsed_url.netloc, secure=False) + + for file_object in client.list_objects(bucket, prefix=prefix, recursive=True): + if file_object.object_name is None: + raise ValueError(f"Object name is None for object {file_object!r}") + if file_object.etag is None: + raise ValueError(f"Object etag is None for object {file_object!r}") + + marker = destination / file_object.etag + if marker.exists(): + continue + + file_destination = destination / file_object.object_name.rsplit("/", 1)[1] + if (file_destination.parent / file_destination.stem).exists(): + # Marker is missing but archive exists means the server archive changed + # force a refresh + shutil.rmtree(file_destination.parent / file_destination.stem) + + with contextlib.suppress(FileExistsError): + self.download_minio_file( + source=source.rsplit("/", 1)[0] + + "/" + + file_object.object_name.rsplit("/", 1)[1], + destination=file_destination, + exists_ok=False, + ) + + if file_destination.is_file() and file_destination.suffix == ".zip": + file_destination.unlink() + marker.touch() diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py new file mode 100644 index 000000000..1f0b2caa1 --- /dev/null +++ b/openml/_api/resources/__init__.py @@ -0,0 +1,65 @@ +from ._registry import API_REGISTRY +from .base import ( + DatasetAPI, + EstimationProcedureAPI, + EvaluationAPI, + EvaluationMeasureAPI, + FallbackProxy, + FlowAPI, + ResourceAPI, + ResourceV1API, + ResourceV2API, + RunAPI, + SetupAPI, + StudyAPI, + TaskAPI, +) +from .dataset import DatasetV1API, DatasetV2API +from .estimation_procedure import ( + EstimationProcedureV1API, + EstimationProcedureV2API, +) +from .evaluation import EvaluationV1API, EvaluationV2API +from .evaluation_measure import EvaluationMeasureV1API, EvaluationMeasureV2API +from .flow import FlowV1API, FlowV2API +from .run import RunV1API, RunV2API +from .setup import SetupV1API, SetupV2API +from .study import StudyV1API, StudyV2API +from .task import TaskV1API, TaskV2API + +__all__ = [ + "API_REGISTRY", + "DatasetAPI", + "DatasetV1API", + "DatasetV2API", + "EstimationProcedureAPI", + "EstimationProcedureV1API", + "EstimationProcedureV2API", + "EvaluationAPI", + "EvaluationMeasureAPI", + "EvaluationMeasureV1API", + "EvaluationMeasureV2API", + "EvaluationV1API", + "EvaluationV2API", + "FallbackProxy", + "FallbackProxy", + "FlowAPI", + "FlowV1API", + "FlowV2API", + "ResourceAPI", + "ResourceAPI", + "ResourceV1API", + "ResourceV2API", + "RunAPI", + "RunV1API", + "RunV2API", + "SetupAPI", + "SetupV1API", + "SetupV2API", + "StudyAPI", + "StudyV1API", + "StudyV2API", + "TaskAPI", + "TaskV1API", + "TaskV2API", +] diff --git a/openml/_api/resources/_registry.py b/openml/_api/resources/_registry.py new file mode 100644 index 000000000..66d7ec428 --- /dev/null +++ b/openml/_api/resources/_registry.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from openml.enums import APIVersion, ResourceType + +from .dataset import DatasetV1API, DatasetV2API +from .estimation_procedure import ( + EstimationProcedureV1API, + EstimationProcedureV2API, +) +from .evaluation import EvaluationV1API, EvaluationV2API +from 
.evaluation_measure import EvaluationMeasureV1API, EvaluationMeasureV2API +from .flow import FlowV1API, FlowV2API +from .run import RunV1API, RunV2API +from .setup import SetupV1API, SetupV2API +from .study import StudyV1API, StudyV2API +from .task import TaskV1API, TaskV2API + +if TYPE_CHECKING: + from .base import ResourceAPI + +API_REGISTRY: dict[ + APIVersion, + dict[ResourceType, type[ResourceAPI]], +] = { + APIVersion.V1: { + ResourceType.DATASET: DatasetV1API, + ResourceType.TASK: TaskV1API, + ResourceType.EVALUATION_MEASURE: EvaluationMeasureV1API, + ResourceType.ESTIMATION_PROCEDURE: EstimationProcedureV1API, + ResourceType.EVALUATION: EvaluationV1API, + ResourceType.FLOW: FlowV1API, + ResourceType.STUDY: StudyV1API, + ResourceType.RUN: RunV1API, + ResourceType.SETUP: SetupV1API, + }, + APIVersion.V2: { + ResourceType.DATASET: DatasetV2API, + ResourceType.TASK: TaskV2API, + ResourceType.EVALUATION_MEASURE: EvaluationMeasureV2API, + ResourceType.ESTIMATION_PROCEDURE: EstimationProcedureV2API, + ResourceType.EVALUATION: EvaluationV2API, + ResourceType.FLOW: FlowV2API, + ResourceType.STUDY: StudyV2API, + ResourceType.RUN: RunV2API, + ResourceType.SETUP: SetupV2API, + }, +} diff --git a/openml/_api/resources/base/__init__.py b/openml/_api/resources/base/__init__.py new file mode 100644 index 000000000..ed6dc26f7 --- /dev/null +++ b/openml/_api/resources/base/__init__.py @@ -0,0 +1,30 @@ +from .base import ResourceAPI +from .fallback import FallbackProxy +from .resources import ( + DatasetAPI, + EstimationProcedureAPI, + EvaluationAPI, + EvaluationMeasureAPI, + FlowAPI, + RunAPI, + SetupAPI, + StudyAPI, + TaskAPI, +) +from .versions import ResourceV1API, ResourceV2API + +__all__ = [ + "DatasetAPI", + "EstimationProcedureAPI", + "EvaluationAPI", + "EvaluationMeasureAPI", + "FallbackProxy", + "FlowAPI", + "ResourceAPI", + "ResourceV1API", + "ResourceV2API", + "RunAPI", + "SetupAPI", + "StudyAPI", + "TaskAPI", +] diff --git a/openml/_api/resources/base/base.py b/openml/_api/resources/base/base.py new file mode 100644 index 000000000..5eadc4932 --- /dev/null +++ b/openml/_api/resources/base/base.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, NoReturn + +from openml.exceptions import OpenMLNotSupportedError + +if TYPE_CHECKING: + from collections.abc import Mapping + from typing import Any + + from openml._api.clients import HTTPClient, MinIOClient + from openml.enums import APIVersion, ResourceType + + +class ResourceAPI(ABC): + api_version: APIVersion + resource_type: ResourceType + + def __init__(self, http: HTTPClient, minio: MinIOClient | None = None): + self._http = http + self._minio = minio + + @abstractmethod + def delete(self, resource_id: int) -> bool: ... + + @abstractmethod + def publish(self, path: str, files: Mapping[str, Any] | None) -> int: ... + + @abstractmethod + def tag(self, resource_id: int, tag: str) -> list[str]: ... + + @abstractmethod + def untag(self, resource_id: int, tag: str) -> list[str]: ... 
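# --- Illustrative sketch, not part of the patch -----------------------------
# Every concrete resource API is constructed with a configured HTTPClient
# (plus an optional MinIOClient) and then exposes the shared
# publish/delete/tag/untag surface declared above, e.g. (assuming `http` is an
# already-configured HTTPClient instance):
#
#     data = DatasetV1API(http)
#     tags = data.tag(61, "my-tag")    # POST data/tag   -> list of tags
#     data.untag(61, "my-tag")         # POST data/untag
#     data.delete(61)                  # DELETE data/61  -> bool
#
# Operations an API version does not implement raise OpenMLNotSupportedError
# (see _not_supported below), which FallbackProxy uses to try another version.
# -----------------------------------------------------------------------------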
+ + def _not_supported(self, *, method: str) -> NoReturn: + version = getattr(self.api_version, "value", "unknown") + resource = getattr(self.resource_type, "value", "unknown") + + raise OpenMLNotSupportedError( + f"{self.__class__.__name__}: " + f"{version} API does not support `{method}` " + f"for resource `{resource}`" + ) diff --git a/openml/_api/resources/base/fallback.py b/openml/_api/resources/base/fallback.py new file mode 100644 index 000000000..3919c36a9 --- /dev/null +++ b/openml/_api/resources/base/fallback.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from collections.abc import Callable +from typing import Any + +from openml.exceptions import OpenMLNotSupportedError + + +class FallbackProxy: + def __init__(self, *api_versions: Any): + if not api_versions: + raise ValueError("At least one API version must be provided") + self._apis = api_versions + + def __getattr__(self, name: str) -> Any: + api, attr = self._find_attr(name) + if callable(attr): + return self._wrap_callable(name, api, attr) + return attr + + def _find_attr(self, name: str) -> tuple[Any, Any]: + for api in self._apis: + attr = getattr(api, name, None) + if attr is not None: + return api, attr + raise AttributeError(f"{self.__class__.__name__} has no attribute {name}") + + def _wrap_callable( + self, + name: str, + primary_api: Any, + primary_attr: Callable[..., Any], + ) -> Callable[..., Any]: + def wrapper(*args: Any, **kwargs: Any) -> Any: + try: + return primary_attr(*args, **kwargs) + except OpenMLNotSupportedError: + return self._call_fallbacks(name, primary_api, *args, **kwargs) + + return wrapper + + def _call_fallbacks( + self, + name: str, + skip_api: Any, + *args: Any, + **kwargs: Any, + ) -> Any: + for api in self._apis: + if api is skip_api: + continue + attr = getattr(api, name, None) + if callable(attr): + try: + return attr(*args, **kwargs) + except OpenMLNotSupportedError: + continue + raise OpenMLNotSupportedError(f"Could not fallback to any API for method: {name}") diff --git a/openml/_api/resources/base/resources.py b/openml/_api/resources/base/resources.py new file mode 100644 index 000000000..3ef97248b --- /dev/null +++ b/openml/_api/resources/base/resources.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +import builtins +from abc import abstractmethod +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +if TYPE_CHECKING: + import pandas as pd + + from openml.datasets.dataset import OpenMLDataFeature, OpenMLDataset +from openml.enums import ResourceType + +from .base import ResourceAPI + + +class DatasetAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.DATASET + + @abstractmethod + def get( # noqa: PLR0913 + self, + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + force_refresh_cache: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: ... + + @abstractmethod + def list( + self, + limit: int, + offset: int, + *, + data_id: list[int] | None = None, # type: ignore + **kwargs: Any, + ) -> pd.DataFrame: ... 
+ + @abstractmethod + def edit( # noqa: PLR0913 + self, + data_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | list[str] | None = None, # type: ignore + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: ... + + @abstractmethod + def fork(self, dataset_id: int) -> int: ... + + @abstractmethod + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: ... + + @abstractmethod + def list_qualities(self) -> builtins.list[str]: ... + + @abstractmethod + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... + + @abstractmethod + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: ... + + @abstractmethod + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: ... + + @abstractmethod + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: ... + + @abstractmethod + def parse_features_file( + self, features_file: Path, features_pickle_file: Path + ) -> dict[int, OpenMLDataFeature]: ... + + @abstractmethod + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path + ) -> dict[str, float]: ... + + @abstractmethod + def _download_file(self, url_ext: str, file_path: str, encoding: str = "utf-8") -> Path: ... + + @abstractmethod + def download_features_file(self, dataset_id: int) -> Path: ... + + @abstractmethod + def download_qualities_file(self, dataset_id: int) -> Path: ... + + @abstractmethod + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: ... + + @abstractmethod + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: ... + + @abstractmethod + def add_topic(self, data_id: int, topic: str) -> int: ... + + @abstractmethod + def delete_topic(self, data_id: int, topic: str) -> int: ... + + @abstractmethod + def get_online_dataset_format(self, dataset_id: int) -> str: ... 
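# --- Illustrative sketch, not part of the patch -----------------------------
# How the dataset interface above is expected to be consumed through
# FallbackProxy: the preferred (V2) implementation is listed first, and any
# OpenMLNotSupportedError falls back to the V1 implementation. The names
# `http_v1`, `http_v2` and `minio` are assumed, pre-configured client
# instances (see the wiring sketch at the end of this excerpt).
from openml._api.resources import DatasetV1API, DatasetV2API, FallbackProxy

datasets = FallbackProxy(
    DatasetV2API(http_v2, minio),   # tried first
    DatasetV1API(http_v1, minio),   # fallback for calls V2 does not support
)

# Resolved on the V2 API; retried on V1 if V2 raises OpenMLNotSupportedError.
dataset = datasets.get(61, download_qualities=True)

# tag/untag/publish/delete are not implemented by ResourceV2API (they raise
# OpenMLNotSupportedError), so the proxy transparently uses the V1 API here.
tags = datasets.tag(61, "my-tag")
# -----------------------------------------------------------------------------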
+ + +class TaskAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.TASK + + +class EvaluationMeasureAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.EVALUATION_MEASURE + + +class EstimationProcedureAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.ESTIMATION_PROCEDURE + + +class EvaluationAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.EVALUATION + + +class FlowAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.FLOW + + +class StudyAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.STUDY + + +class RunAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.RUN + + +class SetupAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.SETUP diff --git a/openml/_api/resources/base/versions.py b/openml/_api/resources/base/versions.py new file mode 100644 index 000000000..b86272377 --- /dev/null +++ b/openml/_api/resources/base/versions.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any, cast + +import xmltodict + +from openml.enums import APIVersion, ResourceType +from openml.exceptions import ( + OpenMLNotAuthorizedError, + OpenMLServerError, + OpenMLServerException, +) + +from .base import ResourceAPI + + +class ResourceV1API(ResourceAPI): + api_version: APIVersion = APIVersion.V1 + + def publish(self, path: str, files: Mapping[str, Any] | None) -> int: + response = self._http.post(path, files=files) + parsed_response = xmltodict.parse(response.content) + return self._extract_id_from_upload(parsed_response) + + def delete(self, resource_id: int) -> bool: + resource_type = self._get_endpoint_name() + + legal_resources = {"data", "flow", "task", "run", "study", "user"} + if resource_type not in legal_resources: + raise ValueError(f"Can't delete a {resource_type}") + + path = f"{resource_type}/{resource_id}" + try: + response = self._http.delete(path) + result = xmltodict.parse(response.content) + return f"oml:{resource_type}_delete" in result + except OpenMLServerException as e: + self._handle_delete_exception(resource_type, e) + raise + + def tag(self, resource_id: int, tag: str) -> list[str]: + resource_type = self._get_endpoint_name() + + legal_resources = {"data", "task", "flow", "setup", "run"} + if resource_type not in legal_resources: + raise ValueError(f"Can't tag a {resource_type}") + + path = f"{resource_type}/tag" + data = {f"{resource_type}_id": resource_id, "tag": tag} + response = self._http.post(path, data=data) + + main_tag = f"oml:{resource_type}_tag" + parsed_response = xmltodict.parse(response.content, force_list={"oml:tag"}) + result = parsed_response[main_tag] + tags: list[str] = result.get("oml:tag", []) + + return tags + + def untag(self, resource_id: int, tag: str) -> list[str]: + resource_type = self._get_endpoint_name() + + legal_resources = {"data", "task", "flow", "setup", "run"} + if resource_type not in legal_resources: + raise ValueError(f"Can't tag a {resource_type}") + + path = f"{resource_type}/untag" + data = {f"{resource_type}_id": resource_id, "tag": tag} + response = self._http.post(path, data=data) + + main_tag = f"oml:{resource_type}_untag" + parsed_response = xmltodict.parse(response.content, force_list={"oml:tag"}) + result = parsed_response[main_tag] + tags: list[str] = result.get("oml:tag", []) + + return tags + + def _get_endpoint_name(self) -> str: + if self.resource_type == ResourceType.DATASET: + return "data" + return cast("str", self.resource_type.value) + + def _handle_delete_exception( + 
self, resource_type: str, exception: OpenMLServerException + ) -> None: + # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php + # Most exceptions are descriptive enough to be raised as their standard + # OpenMLServerException, however there are two cases where we add information: + # - a generic "failed" message, we direct them to the right issue board + # - when the user successfully authenticates with the server, + # but user is not allowed to take the requested action, + # in which case we specify a OpenMLNotAuthorizedError. + by_other_user = [323, 353, 393, 453, 594] + has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595] + unknown_reason = [325, 355, 394, 455, 593] + if exception.code in by_other_user: + raise OpenMLNotAuthorizedError( + message=( + f"The {resource_type} can not be deleted because it was not uploaded by you." + ), + ) from exception + if exception.code in has_dependent_entities: + raise OpenMLNotAuthorizedError( + message=( + f"The {resource_type} can not be deleted because " + f"it still has associated entities: {exception.message}" + ), + ) from exception + if exception.code in unknown_reason: + raise OpenMLServerError( + message=( + f"The {resource_type} can not be deleted for unknown reason," + " please open an issue at: https://github.com/openml/openml/issues/new" + ), + ) from exception + raise exception + + def _extract_id_from_upload(self, parsed: Mapping[str, Any]) -> int: + # reads id from upload response + # actual parsed dict: {"oml:upload_flow": {"@xmlns:oml": "...", "oml:id": "42"}} + + # xmltodict always gives exactly one root key + ((_, root_value),) = parsed.items() + + if not isinstance(root_value, Mapping): + raise ValueError("Unexpected XML structure") + + # Look for oml:id directly in the root value + if "oml:id" in root_value: + id_value = root_value["oml:id"] + if isinstance(id_value, (str, int)): + return int(id_value) + + # Fallback: check all values for numeric/string IDs + for v in root_value.values(): + if isinstance(v, (str, int)): + return int(v) + + raise ValueError("No ID found in upload response") + + +class ResourceV2API(ResourceAPI): + api_version: APIVersion = APIVersion.V2 + + def publish(self, path: str, files: Mapping[str, Any] | None) -> int: # noqa: ARG002 + self._not_supported(method="publish") + + def delete(self, resource_id: int) -> bool: # noqa: ARG002 + self._not_supported(method="delete") + + def tag(self, resource_id: int, tag: str) -> list[str]: # noqa: ARG002 + self._not_supported(method="tag") + + def untag(self, resource_id: int, tag: str) -> list[str]: # noqa: ARG002 + self._not_supported(method="untag") diff --git a/openml/_api/resources/dataset.py b/openml/_api/resources/dataset.py new file mode 100644 index 000000000..ab68ae107 --- /dev/null +++ b/openml/_api/resources/dataset.py @@ -0,0 +1,1106 @@ +# ruff: noqa: PLR0913 +from __future__ import annotations + +import builtins +import json +import logging +import os +import pickle +from collections import OrderedDict +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +import minio +import urllib3 + +from openml.config import OPENML_SKIP_PARQUET_ENV_VAR +from openml.datasets.data_feature import OpenMLDataFeature +from openml.datasets.dataset import OpenMLDataset +from openml.exceptions import ( + OpenMLHashException, + OpenMLMinioRequiredError, + OpenMLPrivateDatasetError, + OpenMLServerException, +) + +from .base import DatasetAPI, ResourceV1API, 
ResourceV2API + +if TYPE_CHECKING: + from requests import Response + + +import pandas as pd +import xmltodict + +logger = logging.getLogger(__name__) + + +NO_ACCESS_GRANTED_ERRCODE = 112 + + +class DatasetV1API(ResourceV1API, DatasetAPI): + def get( + self, + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + force_refresh_cache: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: + path = f"data/{dataset_id}" + try: + response = self._http.get(path, use_cache=True, reset_cache=force_refresh_cache) + xml_content = response.text + description = xmltodict.parse(xml_content)["oml:data_set_description"] + + features_file = None + qualities_file = None + + if download_features_meta_data: + features_file = self.download_features_file(dataset_id) + if download_qualities: + qualities_file = self.download_qualities_file(dataset_id) + + parquet_file = None + skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" + download_parquet = "oml:parquet_url" in description and not skip_parquet + if download_parquet and (download_data or download_all_files): + try: + parquet_file = self.download_dataset_parquet( + description, + download_all_files=download_all_files, + ) + except urllib3.exceptions.MaxRetryError: + parquet_file = None + + arff_file = None + if parquet_file is None and download_data: + if download_parquet: + logger.warning("Failed to download parquet, fallback on ARFF.") + arff_file = self.download_dataset_arff(description) + except OpenMLServerException as e: + # if there was an exception + # check if the user had access to the dataset + if e.code == NO_ACCESS_GRANTED_ERRCODE: + raise OpenMLPrivateDatasetError(e.message) from None + + raise e + + return self._create_dataset_from_xml( + description, features_file, qualities_file, arff_file, parquet_file, cache_format + ) + + def list( + self, + limit: int, + offset: int, + *, + data_id: builtins.list[int] | None = None, + **kwargs: Any, + ) -> pd.DataFrame: + """ + Perform api call to return a list of all datasets. + + Parameters + ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. + data_id : list, optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values. 
+ + Returns + ------- + datasets : dataframe + """ + api_call = "data/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" + + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + api_call += f"/{operator}/{value}" + if data_id is not None: + api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" + xml_string = self._http.get(api_call).text + return self._parse_list_xml(xml_string) + + def edit( + self, + dataset_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | builtins.list[str] | None = None, + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + """Edits an OpenMLDataset. + + In addition to providing the dataset id of the dataset to edit (through dataset_id), + you must specify a value for at least one of the optional function arguments, + i.e. one value for a field to edit. + + This function allows editing of both non-critical and critical fields. + Critical fields are default_target_attribute, ignore_attribute, row_id_attribute. + + - Editing non-critical data fields is allowed for all authenticated users. + - Editing critical fields is allowed only for the owner, provided there are no tasks + associated with this dataset. + + If dataset has tasks or if the user is not the owner, the only way + to edit critical fields is to use fork_dataset followed by edit_dataset. + + Parameters + ---------- + dataset_id : int + ID of the dataset. + description : str + Description of the dataset. + creator : str + The person who created the dataset. + contributor : str + People who contributed to the current version of the dataset. + collection_date : str + The date the data was originally collected, given by the uploader. + language : str + Language in which the data is represented. + Starts with 1 upper case letter, rest lower case, e.g. 'English'. + default_target_attribute : str + The default target attribute, if it exists. + Can have multiple values, comma separated. + ignore_attribute : str | list + Attributes that should be excluded in modelling, + such as identifiers and indexes. + citation : str + Reference(s) that should be cited when building on this data. + row_id_attribute : str, optional + The attribute that represents the row-id column, if present in the + dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not + specified, the index of the dataframe will be used as the + ``row_id_attribute``. If the name of the index is ``None``, it will + be discarded. + + .. versionadded: 0.8 + Inference of ``row_id_attribute`` from a dataframe. + original_data_url : str, optional + For derived data, the url to the original dataset. + paper_url : str, optional + Link to a paper describing the dataset. 
+ + Returns + ------- + Dataset id + """ + # compose data edit parameters as xml + form_data = {"data_id": dataset_id} # type: dict[str, str | int] + xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' + xml["oml:data_edit_parameters"] = OrderedDict() + xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" + xml["oml:data_edit_parameters"]["oml:description"] = description + xml["oml:data_edit_parameters"]["oml:creator"] = creator + xml["oml:data_edit_parameters"]["oml:contributor"] = contributor + xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date + xml["oml:data_edit_parameters"]["oml:language"] = language + xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute + xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute + xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute + xml["oml:data_edit_parameters"]["oml:citation"] = citation + xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url + xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url + + # delete None inputs + for k in list(xml["oml:data_edit_parameters"]): + if not xml["oml:data_edit_parameters"][k]: + del xml["oml:data_edit_parameters"][k] + + file_elements = { + "edit_parameters": ("description.xml", xmltodict.unparse(xml)), + } # type: dict[str, str | tuple[str, str]] + result_xml = self._http.post("data/edit", data=form_data, files=file_elements).text + result = xmltodict.parse(result_xml) + dataset_id = result["oml:data_edit"]["oml:id"] + return int(dataset_id) + + def fork(self, dataset_id: int) -> int: + """ + Creates a new dataset version, with the authenticated user as the new owner. + The forked dataset can have distinct dataset meta-data, + but the actual data itself is shared with the original version. + + This API is intended for use when a user is unable to edit the critical fields of a dataset + through the edit_dataset API. + (Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.) + + Specifically, this happens when the user is: + 1. Not the owner of the dataset. + 2. User is the owner of the dataset, but the dataset has tasks. + + In these two cases the only way to edit critical fields is: + 1. STEP 1: Fork the dataset using fork_dataset API + 2. STEP 2: Call edit_dataset API on the forked version. + + + Parameters + ---------- + dataset_id : int + id of the dataset to be forked + + Returns + ------- + Dataset id of the forked dataset + + """ + if not isinstance(dataset_id, int): + raise TypeError(f"`dataset_id` must be of type `int`, not {type(dataset_id)}.") + # compose data fork parameters + form_data = {"data_id": dataset_id} + result_xml = self._http.post("data/fork", data=form_data).text + result = xmltodict.parse(result_xml) + dataset_id = result["oml:data_fork"]["oml:id"] + return int(dataset_id) + + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. + Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + dataset_id : int + The data id of the dataset + status : str, + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") + + data: dict[str, str | int] = {"data_id": dataset_id, "status": status} + result_xml = self._http.post("data/status/update", data=data).text + result = xmltodict.parse(result_xml) + server_data_id = result["oml:data_status_update"]["oml:id"] + server_status = result["oml:data_status_update"]["oml:status"] + if status != server_status or int(dataset_id) != int(server_data_id): + # This should never happen + raise ValueError("Data id/status does not collide") + + def list_qualities(self) -> builtins.list[str]: + """Return list of data qualities available. + + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. + + Returns + ------- + list + """ + api_call = "data/qualities/list" + xml_string = self._http.get(api_call).text + qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) + # Minimalistic check if the XML is useful + if "oml:data_qualities_list" not in qualities: + raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') + + if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): + raise TypeError('Error in return XML, does not contain "oml:quality" as a list') + + return qualities["oml:data_qualities_list"]["oml:quality"] + + def _create_dataset_from_xml( + self, + description: dict, + features_file: Path | None = None, + qualities_file: Path | None = None, + arff_file: Path | None = None, + parquet_file: Path | None = None, + cache_format: Literal["pickle", "feather"] = "pickle", + ) -> OpenMLDataset: + """Create a dataset given a xml string. + + Parameters + ---------- + xml : string + Dataset xml representation. + + Returns + ------- + OpenMLDataset + """ + return OpenMLDataset( + description["oml:name"], + description.get("oml:description"), + data_format=description["oml:format"], + dataset_id=int(description["oml:id"]), + version=int(description["oml:version"]), + creator=description.get("oml:creator"), + contributor=description.get("oml:contributor"), + collection_date=description.get("oml:collection_date"), + upload_date=description.get("oml:upload_date"), + language=description.get("oml:language"), + licence=description.get("oml:licence"), + url=description["oml:url"], + default_target_attribute=description.get("oml:default_target_attribute"), + row_id_attribute=description.get("oml:row_id_attribute"), + ignore_attribute=description.get("oml:ignore_attribute"), + version_label=description.get("oml:version_label"), + citation=description.get("oml:citation"), + tag=description.get("oml:tag"), + cache_format=cache_format, + visibility=description.get("oml:visibility"), + original_data_url=description.get("oml:original_data_url"), + paper_url=description.get("oml:paper_url"), + update_comment=description.get("oml:update_comment"), + md5_checksum=description.get("oml:md5_checksum"), + data_file=str(arff_file) if arff_file is not None else None, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=description.get("oml:parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, + ) + + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: + """ + An ontology describes the concept that are described in a feature. An + ontology is defined by an URL where the information is provided. 
Adds + an ontology (URL) to a given dataset feature (defined by a dataset id + and index). The dataset has to exists on OpenML and needs to have been + processed by the evaluation engine. + + Parameters + ---------- + dataset_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = { + "data_id": dataset_id, + "index": index, + "ontology": ontology, + } + self._http.post("data/feature/ontology/add", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: + """ + Removes an existing ontology (URL) from a given dataset feature (defined + by a dataset id and index). The dataset has to exists on OpenML and needs + to have been processed by the evaluation engine. Ontology needs to be + attached to the specific fearure. + + Parameters + ---------- + dataset_id : int + id of the dataset to which the feature belongs + index : int + index of the feature in dataset (0-based) + ontology : str + URL to ontology (max. 256 characters) + + Returns + ------- + True or throws an OpenML server exception + """ + upload_data: dict[str, int | str] = { + "data_id": dataset_id, + "index": index, + "ontology": ontology, + } + self._http.post("data/feature/ontology/remove", data=upload_data) + # an error will be thrown in case the request was unsuccessful + return True + + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: + path = f"data/features/{dataset_id}" + xml = self._http.get(path, use_cache=True).text + + return self._parse_features_xml(xml) + + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: + path = f"data/qualities/{dataset_id!s}" + try: + xml = self._http.get(path, use_cache=True).text + except OpenMLServerException as e: + if e.code == 362 and str(e) == "No qualities found - None": + # quality file stays as None + logger.warning(f"No qualities found for dataset {dataset_id}") + return None + + raise e + + return self._parse_qualities_xml(xml) + + def parse_features_file( + self, features_file: Path, features_pickle_file: Path | None = None + ) -> dict[int, OpenMLDataFeature]: + if features_pickle_file is None: + features_pickle_file = features_file.with_suffix(features_file.suffix + ".pkl") + if features_file.suffix != ".xml": + # TODO (Shrivaths) can only parse xml warn/ raise exception + raise NotImplementedError() + + with Path(features_file).open("r", encoding="utf8") as fh: + features_xml = fh.read() + + features = self._parse_features_xml(features_xml) + + with features_pickle_file.open("wb") as fh_binary: + pickle.dump(features, fh_binary) + + return features + + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path | None = None + ) -> dict[str, float]: + if qualities_pickle_file is None: + qualities_pickle_file = qualities_file.with_suffix(qualities_file.suffix + ".pkl") + if qualities_file.suffix != ".xml": + # TODO (Shrivaths) can only parse xml warn/ raise exception + raise NotImplementedError() + + with Path(qualities_file).open("r", encoding="utf8") as fh: + qualities_xml = fh.read() + + qualities = self._parse_qualities_xml(qualities_xml) + + with qualities_pickle_file.open("wb") as fh_binary: + pickle.dump(qualities, fh_binary) + + return qualities + + def 
_parse_features_xml(self, features_xml_string: str) -> dict[int, OpenMLDataFeature]: + xml_dict = xmltodict.parse( + features_xml_string, + force_list=("oml:feature", "oml:nominal_value"), + strip_whitespace=False, + ) + features_xml = xml_dict["oml:data_features"] + + features: dict[int, OpenMLDataFeature] = {} + for idx, xmlfeature in enumerate(features_xml["oml:feature"]): + nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(xmlfeature["oml:index"]), + xmlfeature["oml:name"], + xmlfeature["oml:data_type"], + xmlfeature.get("oml:nominal_value"), + int(nr_missing), + xmlfeature.get("oml:ontology"), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + return features + + def _parse_qualities_xml(self, qualities_xml: str) -> dict[str, float]: + xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) + qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] + qualities_ = {} + for xmlquality in qualities: + name = xmlquality["oml:name"] + if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null": + value = float("NaN") + else: + value = float(xmlquality["oml:value"]) + qualities_[name] = value + return qualities_ + + def _parse_list_xml(self, xml_string: str) -> pd.DataFrame: + datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) + # Minimalistic check if the XML is useful + assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( + datasets_dict["oml:data"], + ) + assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ + "oml:data" + ]["@xmlns:oml"] + + datasets = {} + for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: + ignore_attribute = ["oml:file_id", "oml:quality"] + dataset = { + k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute + } + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("oml:quality", []): + try: + dataset[quality["@name"]] = int(quality["#text"]) + except ValueError: + dataset[quality["@name"]] = float(quality["#text"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + + def _download_file(self, url_ext: str, file_path: str, encoding: str = "utf-8") -> Path: + def __handler(response: Response, path: Path, encoding: str) -> Path: + with path.open("w", encoding=encoding) as f: + f.write(response.text) + return path + + return self._http.download(url_ext, __handler, encoding, file_path) + + def download_features_file(self, dataset_id: int) -> Path: + path = f"data/features/{dataset_id}" + file = self._download_file(path, "features.xml") + _ = self.parse_features_file(file) + return file + + def download_qualities_file(self, dataset_id: int) -> Path: + path = f"data/qualities/{dataset_id}" + file = self._download_file(path, "qualities.xml") + _ = self.parse_qualities_file(file) + return file + + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: + if self._minio is None: + raise OpenMLMinioRequiredError( + "A minio object is required for Dataset, but none was provided" + ) + if 
isinstance(description, dict): + url = str(description.get("oml:parquet_url")) + elif isinstance(description, OpenMLDataset): + url = str(description._parquet_url) + assert description.dataset_id is not None + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + if download_all_files: + self._minio.download_minio_bucket(source=url) + + try: + output_file_path = self._minio.download_minio_file( + source=url, + ) + except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: + logger.warning(f"Could not download file from {url}: {e}") + return None + return output_file_path + + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: + if isinstance(description, dict): + md5_checksum_fixture = description.get("oml:md5_checksum") + url = str(description["oml:url"]) + did = int(description.get("oml:id")) # type: ignore + elif isinstance(description, OpenMLDataset): + md5_checksum_fixture = description.md5_checksum + assert description.url is not None + assert description.dataset_id is not None + + url = description.url + did = int(description.dataset_id) + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + try: + output_file_path = self._http.download( + url, file_name="dataset.arff", md5_checksum=md5_checksum_fixture + ) + except OpenMLHashException as e: + additional_info = f" Raised when downloading dataset {did}." + e.args = (e.args[0] + additional_info,) + raise e + + return output_file_path + + def add_topic(self, data_id: int, topic: str) -> int: + form_data = {"data_id": data_id, "topic": topic} # type: dict[str, str | int] + result_xml = self._http.post("data/topicadd", data=form_data) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_topic"]["oml:id"] + return int(data_id) + + def delete_topic(self, data_id: int, topic: str) -> int: + form_data = {"data_id": data_id, "topic": topic} # type: dict[str, str | int] + result_xml = self._http.post("data/topicdelete", data=form_data) + result = xmltodict.parse(result_xml) + data_id = result["oml:data_topic"]["oml:id"] + return int(data_id) + + def get_online_dataset_format(self, dataset_id: int) -> str: + dataset_xml = self._http.get(f"data/{dataset_id}") + # build a dict from the xml and get the format from the dataset description + return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower() # type: ignore + + def get_online_dataset_arff(self, dataset_id: int) -> str | None: + dataset_xml = self._http.get(f"data/{dataset_id}") + # build a dict from the xml. 
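The V1 endpoints above all speak XML, and the parsers rely on xmltodict's force_list so that a single element still comes back as a list. A minimal standalone sketch of that pattern (the XML snippet is fabricated for illustration, not a real server response):

import xmltodict

features_xml = """
<oml:data_features xmlns:oml="http://openml.org/openml">
  <oml:feature>
    <oml:index>0</oml:index>
    <oml:name>sepallength</oml:name>
    <oml:data_type>numeric</oml:data_type>
  </oml:feature>
</oml:data_features>
"""

# Without force_list, a single <oml:feature> would parse to a dict instead of a one-element list.
parsed = xmltodict.parse(features_xml, force_list=("oml:feature", "oml:nominal_value"))
assert isinstance(parsed["oml:data_features"]["oml:feature"], list)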
+ # use the url from the dataset description and return the ARFF string + return str(self.download_dataset_arff(xmltodict.parse(dataset_xml))) + + +class DatasetV2API(ResourceV2API, DatasetAPI): + def get( + self, + dataset_id: int, + download_data: bool = False, # noqa: FBT002 + cache_format: Literal["pickle", "feather"] = "pickle", + download_qualities: bool = False, # noqa: FBT002 + download_features_meta_data: bool = False, # noqa: FBT002 + download_all_files: bool = False, # noqa: FBT002 + force_refresh_cache: bool = False, # noqa: FBT002 + ) -> OpenMLDataset: + path = f"datasets/{dataset_id}" + try: + response = self._http.get(path, use_cache=True, reset_cache=force_refresh_cache) + json_content = response.json() + features_file = None + qualities_file = None + + if download_features_meta_data: + features_file = self.download_features_file(dataset_id) + if download_qualities: + qualities_file = self.download_qualities_file(dataset_id) + + parquet_file = None + skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" + download_parquet = "parquet_url" in json_content and not skip_parquet + if download_parquet and (download_data or download_all_files): + try: + parquet_file = self.download_dataset_parquet( + json_content, + download_all_files=download_all_files, + ) + except urllib3.exceptions.MaxRetryError: + parquet_file = None + + arff_file = None + if parquet_file is None and download_data: + if download_parquet: + logger.warning("Failed to download parquet, fallback on ARFF.") + arff_file = self.download_dataset_arff(json_content) + except OpenMLServerException as e: + # if there was an exception + # check if the user had access to the dataset + if e.code == NO_ACCESS_GRANTED_ERRCODE: + raise OpenMLPrivateDatasetError(e.message) from None + + raise e + + return self._create_dataset_from_json( + json_content, features_file, qualities_file, arff_file, parquet_file, cache_format + ) + + def list( + self, + limit: int, + offset: int, + *, + data_id: builtins.list[int] | None = None, + **kwargs: Any, + ) -> pd.DataFrame: + """ + Perform api call to return a list of all datasets. + + Parameters + ---------- + The arguments that are lists are separated from the single value + ones which are put into the kwargs. + display_errors is also separated from the kwargs since it has a + default value. + + limit : int + The maximum number of datasets to show. + offset : int + The number of datasets to skip, starting from the first. + dataset_id: list[int], optional + + kwargs : dict, optional + Legal filter operators (keys in the dict): + tag, status, limit, offset, data_name, data_version, number_instances, + number_features, number_classes, number_missing_values. 
+ + Returns + ------- + datasets : dataframe + """ + json: dict[str, Any] = {"pagination": {}} + + if limit is not None: + json["pagination"]["limit"] = limit + if offset is not None: + json["pagination"]["offset"] = offset + if data_id is not None: + json["data_id"] = data_id + if kwargs is not None: + for operator, value in kwargs.items(): + if value is not None: + json[operator] = value + + api_call = "datasets/list" + datasets_list = self._http.post(path=api_call, json=json, use_api_key=False).json() + # Minimalistic check if the JSON is useful + assert isinstance(datasets_list, list), type(datasets_list) + + return self._parse_list_json(datasets_list) + + def edit( + self, + dataset_id: int, + description: str | None = None, + creator: str | None = None, + contributor: str | None = None, + collection_date: str | None = None, + language: str | None = None, + default_target_attribute: str | None = None, + ignore_attribute: str | builtins.list[str] | None = None, + citation: str | None = None, + row_id_attribute: str | None = None, + original_data_url: str | None = None, + paper_url: str | None = None, + ) -> int: + raise NotImplementedError(self._not_supported(method="edit")) + + def fork(self, dataset_id: int) -> int: + raise NotImplementedError(self._not_supported(method="fork")) + + def status_update(self, dataset_id: int, status: Literal["active", "deactivated"]) -> None: + """ + Updates the status of a dataset to either 'active' or 'deactivated'. + Please see the OpenML API documentation for a description of the status + and all legal status transitions: + https://docs.openml.org/concepts/data/#dataset-status + + Parameters + ---------- + dataset_id : int + The data id of the dataset + status : str + 'active' or 'deactivated' + """ + legal_status = {"active", "deactivated"} + if status not in legal_status: + raise ValueError(f"Illegal status value. Legal values: {legal_status}") + + data: dict[str, str | int] = {"dataset_id": dataset_id, "status": status} + result = self._http.post("datasets/status/update", json=data).json() + server_data_id = result["dataset_id"] + server_status = result["status"] + if status != server_status or int(dataset_id) != int(server_data_id): + # This should never happen + raise ValueError("Dataset id/status returned by the server does not match the request") + + def list_qualities(self) -> builtins.list[str]: + """Return list of data qualities available. + + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. + + Returns + ------- + list + """ + api_call = "datasets/qualities/list" + qualities = self._http.get(api_call).json() + # Minimalistic check if the JSON is useful + if "data_qualities_list" not in qualities: + raise ValueError('Error in returned JSON, does not contain "data_qualities_list"') + + if not isinstance(qualities["data_qualities_list"]["quality"], list): + raise TypeError('Error in returned JSON, does not contain "quality" as a list') + + return qualities["data_qualities_list"]["quality"] + + def _create_dataset_from_json( + self, + json_content: dict, + features_file: Path | None = None, + qualities_file: Path | None = None, + arff_file: Path | None = None, + parquet_file: Path | None = None, + cache_format: Literal["pickle", "feather"] = "pickle", + ) -> OpenMLDataset: + """Create a dataset given a json. + + Parameters + ---------- + json_content : dict + Dataset dict/json representation. 
+ + Returns + ------- + OpenMLDataset + """ + return OpenMLDataset( + json_content["name"], + json_content.get("description"), + data_format=json_content["format"], + dataset_id=int(json_content["id"]), + version=int(json_content["version"]), + creator=json_content.get("creator"), + contributor=json_content.get("contributor"), + collection_date=json_content.get("collection_date"), + upload_date=json_content.get("upload_date"), + language=json_content.get("language"), + licence=json_content.get("licence"), + url=json_content["url"], + default_target_attribute=json_content.get("default_target_attribute"), + row_id_attribute=json_content.get("row_id_attribute"), + ignore_attribute=json_content.get("ignore_attribute"), + version_label=json_content.get("version_label"), + citation=json_content.get("citation"), + tag=json_content.get("tag"), + cache_format=cache_format, + visibility=json_content.get("visibility"), + original_data_url=json_content.get("original_data_url"), + paper_url=json_content.get("paper_url"), + update_comment=json_content.get("update_comment"), + md5_checksum=json_content.get("md5_checksum"), + data_file=str(arff_file) if arff_file is not None else None, + features_file=str(features_file) if features_file is not None else None, + qualities_file=str(qualities_file) if qualities_file is not None else None, + parquet_url=json_content.get("parquet_url"), + parquet_file=str(parquet_file) if parquet_file is not None else None, + ) + + def feature_add_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: + raise NotImplementedError(self._not_supported(method="feature_add_ontology")) + + def feature_remove_ontology(self, dataset_id: int, index: int, ontology: str) -> bool: + raise NotImplementedError(self._not_supported(method="feature_remove_ontology")) + + def get_features(self, dataset_id: int) -> dict[int, OpenMLDataFeature]: + path = f"datasets/features/{dataset_id}" + json = self._http.get(path, use_cache=True).json() + + return self._parse_features_json(json) + + def get_qualities(self, dataset_id: int) -> dict[str, float] | None: + path = f"datasets/qualities/{dataset_id!s}" + try: + qualities_json = self._http.get(path, use_cache=True).json() + except OpenMLServerException as e: + if e.code == 362 and str(e) == "No qualities found - None": + logger.warning(f"No qualities found for dataset {dataset_id}") + return None + + raise e + + return self._parse_qualities_json(qualities_json) + + def parse_features_file( + self, features_file: Path, features_pickle_file: Path | None = None + ) -> dict[int, OpenMLDataFeature]: + if features_pickle_file is None: + features_pickle_file = features_file.with_suffix(features_file.suffix + ".pkl") + if features_file.suffix == ".xml": + # can fallback to v1 if the file is .xml + raise NotImplementedError() + + with Path(features_file).open("r", encoding="utf8") as fh: + features_json = json.load(fh) + + features = self._parse_features_json(features_json) + + with features_pickle_file.open("wb") as fh_binary: + pickle.dump(features, fh_binary) + + return features + + def parse_qualities_file( + self, qualities_file: Path, qualities_pickle_file: Path | None = None + ) -> dict[str, float]: + if qualities_pickle_file is None: + qualities_pickle_file = qualities_file.with_suffix(qualities_file.suffix + ".pkl") + if qualities_file.suffix == ".xml": + # can fallback to v1 if the file is .xml + raise NotImplementedError() + + with Path(qualities_file).open("r", encoding="utf8") as fh: + qualities_json = json.load(fh) + + qualities = 
self._parse_qualities_json(qualities_json) + + with qualities_pickle_file.open("wb") as fh_binary: + pickle.dump(qualities, fh_binary) + + return qualities + + def _parse_features_json(self, features_json: dict) -> dict[int, OpenMLDataFeature]: + features: dict[int, OpenMLDataFeature] = {} + for idx, jsonfeatures in enumerate(features_json): + nr_missing = jsonfeatures.get("number_of_missing_values", 0) + feature = OpenMLDataFeature( + int(jsonfeatures["index"]), + jsonfeatures["name"], + jsonfeatures["data_type"], + jsonfeatures.get("nominal_values"), + int(nr_missing), + jsonfeatures.get("ontology"), + ) + if idx != feature.index: + raise ValueError("Data features not provided in right order") + features[feature.index] = feature + + return features + + def _parse_qualities_json(self, qualities_json: dict) -> dict[str, float]: + qualities_ = {} + for quality in qualities_json: + name = quality["name"] + if quality.get("value", None) is None or quality["value"] == "null": + value = float("NaN") + else: + value = float(quality["value"]) + qualities_[name] = value + return qualities_ + + def _parse_list_json(self, datasets_list: builtins.list) -> pd.DataFrame: + datasets = {} + for dataset_ in datasets_list: + ignore_attribute = ["file_id", "quality"] + dataset = {k: v for (k, v) in dataset_.items() if k not in ignore_attribute} + dataset["did"] = int(dataset["did"]) + dataset["version"] = int(dataset["version"]) + + # The number of qualities can range from 0 to infinity + for quality in dataset_.get("quality", []): + try: + dataset[quality["name"]] = int(quality["value"]) + except ValueError: + dataset[quality["name"]] = float(quality["value"]) + datasets[dataset["did"]] = dataset + + return pd.DataFrame.from_dict(datasets, orient="index").astype( + { + "did": int, + "version": int, + "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), + } + ) + + def _download_file(self, url_ext: str, file_name: str, encoding: str = "utf-8") -> Path: + def __handler(response: Response, path: Path, encoding: str) -> Path: + with path.open("w", encoding=encoding) as f: + json.dump(response.json(), f, indent=4) + return path + + return self._http.download(url_ext, __handler, encoding, file_name) + + def download_features_file(self, dataset_id: int) -> Path: + path = f"datasets/features/{dataset_id}" + file = self._download_file(path, "features.json") + _ = self.parse_features_file(file) + return file + + def download_qualities_file(self, dataset_id: int) -> Path: + path = f"datasets/qualities/{dataset_id}" + file = self._download_file(path, "qualities.json") + _ = self.parse_qualities_file(file) + return file + + def download_dataset_parquet( + self, + description: dict | OpenMLDataset, + download_all_files: bool = False, # noqa: FBT002 + ) -> Path | None: + if self._minio is None: + raise OpenMLMinioRequiredError( + "A minio object is required for Dataset, but none was provided" + ) + if isinstance(description, dict): + url = str(description.get("parquet_url")) + elif isinstance(description, OpenMLDataset): + url = str(description._parquet_url) + assert description.dataset_id is not None + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + if download_all_files: + self._minio.download_minio_bucket(source=url) + + try: + output_file_path = self._minio.download_minio_file(source=url) + except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: + logger.warning(f"Could not download file from {url}: {e}") + return None + 
return output_file_path + + def download_dataset_arff( + self, + description: dict | OpenMLDataset, + ) -> Path: + if isinstance(description, dict): + url = str(description["url"]) + did = int(description.get("id")) # type: ignore + elif isinstance(description, OpenMLDataset): + assert description.url is not None + assert description.dataset_id is not None + + url = description.url + did = int(description.dataset_id) + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + try: + output_file_path = self._http.download(url, file_name="dataset.arff") + except OpenMLHashException as e: + additional_info = f" Raised when downloading dataset {did}." + e.args = (e.args[0] + additional_info,) + raise e + + return output_file_path + + def add_topic(self, data_id: int, topic: str) -> int: + raise NotImplementedError(self._not_supported(method="add_topic")) + + def delete_topic(self, data_id: int, topic: str) -> int: + raise NotImplementedError(self._not_supported(method="delete_topic")) + + def get_online_dataset_format(self, dataset_id: int) -> str: + dataset_json = self._http.get(f"datasets/{dataset_id}").json() + # read the format directly from the JSON dataset description + return str(dataset_json["format"]).lower() + + def get_online_dataset_arff(self, dataset_id: int) -> str | None: + dataset_json = self._http.get(f"datasets/{dataset_id}").json() + # use the url from the JSON dataset description and return the ARFF string + return str(self.download_dataset_arff(dataset_json)) diff --git a/openml/_api/resources/estimation_procedure.py b/openml/_api/resources/estimation_procedure.py new file mode 100644 index 000000000..b8ea7d2c3 --- /dev/null +++ b/openml/_api/resources/estimation_procedure.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .base import EstimationProcedureAPI, ResourceV1API, ResourceV2API + + +class EstimationProcedureV1API(ResourceV1API, EstimationProcedureAPI): + pass + + +class EstimationProcedureV2API(ResourceV2API, EstimationProcedureAPI): + pass diff --git a/openml/_api/resources/evaluation.py b/openml/_api/resources/evaluation.py new file mode 100644 index 000000000..07877e14e --- /dev/null +++ b/openml/_api/resources/evaluation.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .base import EvaluationAPI, ResourceV1API, ResourceV2API + + +class EvaluationV1API(ResourceV1API, EvaluationAPI): + pass + + +class EvaluationV2API(ResourceV2API, EvaluationAPI): + pass diff --git a/openml/_api/resources/evaluation_measure.py b/openml/_api/resources/evaluation_measure.py new file mode 100644 index 000000000..63cf16c77 --- /dev/null +++ b/openml/_api/resources/evaluation_measure.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .base import EvaluationMeasureAPI, ResourceV1API, ResourceV2API + + +class EvaluationMeasureV1API(ResourceV1API, EvaluationMeasureAPI): + pass + + +class EvaluationMeasureV2API(ResourceV2API, EvaluationMeasureAPI): + pass diff --git a/openml/_api/resources/flow.py b/openml/_api/resources/flow.py new file mode 100644 index 000000000..ad2e05bd9 --- /dev/null +++ b/openml/_api/resources/flow.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .base import FlowAPI, ResourceV1API, ResourceV2API + + +class FlowV1API(ResourceV1API, FlowAPI): + pass + + +class FlowV2API(ResourceV2API, FlowAPI): + pass diff --git a/openml/_api/resources/run.py b/openml/_api/resources/run.py new file mode 100644 index 
000000000..151c69e35 --- /dev/null +++ b/openml/_api/resources/run.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .base import ResourceV1API, ResourceV2API, RunAPI + + +class RunV1API(ResourceV1API, RunAPI): + pass + + +class RunV2API(ResourceV2API, RunAPI): + pass diff --git a/openml/_api/resources/setup.py b/openml/_api/resources/setup.py new file mode 100644 index 000000000..78a36cecc --- /dev/null +++ b/openml/_api/resources/setup.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .base import ResourceV1API, ResourceV2API, SetupAPI + + +class SetupV1API(ResourceV1API, SetupAPI): + pass + + +class SetupV2API(ResourceV2API, SetupAPI): + pass diff --git a/openml/_api/resources/study.py b/openml/_api/resources/study.py new file mode 100644 index 000000000..cefd55004 --- /dev/null +++ b/openml/_api/resources/study.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .base import ResourceV1API, ResourceV2API, StudyAPI + + +class StudyV1API(ResourceV1API, StudyAPI): + pass + + +class StudyV2API(ResourceV2API, StudyAPI): + pass diff --git a/openml/_api/resources/task.py b/openml/_api/resources/task.py new file mode 100644 index 000000000..a367c9aa1 --- /dev/null +++ b/openml/_api/resources/task.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from .base import ResourceV1API, ResourceV2API, TaskAPI + + +class TaskV1API(ResourceV1API, TaskAPI): + pass + + +class TaskV2API(ResourceV2API, TaskAPI): + pass diff --git a/openml/_api/setup/__init__.py b/openml/_api/setup/__init__.py new file mode 100644 index 000000000..1c28cfa9e --- /dev/null +++ b/openml/_api/setup/__init__.py @@ -0,0 +1,14 @@ +from ._instance import _backend +from .backend import APIBackend +from .builder import APIBackendBuilder +from .config import APIConfig, CacheConfig, Config, ConnectionConfig + +__all__ = [ + "APIBackend", + "APIBackendBuilder", + "APIConfig", + "CacheConfig", + "Config", + "ConnectionConfig", + "_backend", +] diff --git a/openml/_api/setup/_instance.py b/openml/_api/setup/_instance.py new file mode 100644 index 000000000..c98ccaf57 --- /dev/null +++ b/openml/_api/setup/_instance.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from .backend import APIBackend + +_backend = APIBackend.get_instance() diff --git a/openml/_api/setup/_utils.py b/openml/_api/setup/_utils.py new file mode 100644 index 000000000..ddcf5b41c --- /dev/null +++ b/openml/_api/setup/_utils.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import logging +import os +import platform +from pathlib import Path + +openml_logger = logging.getLogger("openml") + +# Default values (see also https://github.com/openml/OpenML/wiki/Client-API-Standards) +_user_path = Path("~").expanduser().absolute() + + +def _resolve_default_cache_dir() -> Path: + user_defined_cache_dir = os.environ.get("OPENML_CACHE_DIR") + if user_defined_cache_dir is not None: + return Path(user_defined_cache_dir) + + if platform.system().lower() != "linux": + return _user_path / ".openml" + + xdg_cache_home = os.environ.get("XDG_CACHE_HOME") + if xdg_cache_home is None: + return Path("~", ".cache", "openml") + + # This is the proper XDG_CACHE_HOME directory, but + # we unfortunately had a problem where we used XDG_CACHE_HOME/org, + # we check heuristically if this old directory still exists and issue + # a warning if it does. There's too much data to move to do this for the user. 
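The cache-directory resolution sketched in _resolve_default_cache_dir (continued just below) checks OPENML_CACHE_DIR first, then falls back to ~/.openml off Linux and to XDG_CACHE_HOME/openml on Linux. A hedged usage sketch; the path is a placeholder and setting the environment variable is for illustration only:

import os
from pathlib import Path

os.environ["OPENML_CACHE_DIR"] = "/tmp/openml-cache"  # placeholder path; overrides every other rule

from openml._api.setup._utils import _resolve_default_cache_dir

assert _resolve_default_cache_dir() == Path("/tmp/openml-cache")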
+ + # The new cache directory exists + cache_dir = Path(xdg_cache_home) / "openml" + if cache_dir.exists(): + return cache_dir + + # The old cache directory *does not* exist + heuristic_dir_for_backwards_compat = Path(xdg_cache_home) / "org" / "openml" + if not heuristic_dir_for_backwards_compat.exists(): + return cache_dir + + root_dir_to_delete = Path(xdg_cache_home) / "org" + openml_logger.warning( + "An old cache directory was found at '%s'. This directory is no longer used by " + "OpenML-Python. To silence this warning you would need to delete the old cache " + "directory. The cached files will then be located in '%s'.", + root_dir_to_delete, + cache_dir, + ) + return Path(xdg_cache_home) diff --git a/openml/_api/setup/backend.py b/openml/_api/setup/backend.py new file mode 100644 index 000000000..f0faf5165 --- /dev/null +++ b/openml/_api/setup/backend.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from copy import deepcopy +from typing import Any + +from .builder import APIBackendBuilder +from .config import Config + + +class APIBackend: + _instance: APIBackend | None = None + + def __init__(self, config: Config | None = None): + self._config: Config = config or Config() + self._backend = APIBackendBuilder.build(self._config) + + def __getattr__(self, name: str) -> Any: + """ + Delegate attribute access to the underlying backend. + Called only if attribute is not found on RuntimeBackend. + """ + return getattr(self._backend, name) + + @classmethod + def get_instance(cls) -> APIBackend: + if cls._instance is None: + cls._instance = cls() + return cls._instance + + @classmethod + def get_config(cls) -> Config: + return deepcopy(cls.get_instance()._config) + + @classmethod + def set_config(cls, config: Config) -> None: + instance = cls.get_instance() + instance._config = config + instance._backend = APIBackendBuilder.build(config) + + @classmethod + def get_config_value(cls, key: str) -> Config: + keys = key.split(".") + config_value = cls.get_instance()._config + for k in keys: + if isinstance(config_value, dict): + config_value = config_value[k] + else: + config_value = getattr(config_value, k) + return deepcopy(config_value) + + @classmethod + def set_config_value(cls, key: str, value: Any) -> None: + keys = key.split(".") + config = cls.get_instance()._config + parent = config + for k in keys[:-1]: + parent = parent[k] if isinstance(parent, dict) else getattr(parent, k) + if isinstance(parent, dict): + parent[keys[-1]] = value + else: + setattr(parent, keys[-1], value) + cls.set_config(config) diff --git a/openml/_api/setup/builder.py b/openml/_api/setup/builder.py new file mode 100644 index 000000000..750db431a --- /dev/null +++ b/openml/_api/setup/builder.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from collections.abc import Mapping +from pathlib import Path +from typing import TYPE_CHECKING + +from openml._api.clients import HTTPCache, HTTPClient, MinIOClient +from openml._api.resources import API_REGISTRY, FallbackProxy, ResourceAPI + +if TYPE_CHECKING: + from openml.enums import ResourceType + + from .config import Config + + +class APIBackendBuilder: + def __init__( + self, + resource_apis: Mapping[ResourceType, ResourceAPI | FallbackProxy], + ): + for resource_type, resource_api in resource_apis.items(): + setattr(self, resource_type.value, resource_api) + + @classmethod + def build(cls, config: Config) -> APIBackendBuilder: + cache_dir = Path(config.cache.dir).expanduser() + + http_cache = HTTPCache(path=cache_dir, ttl=config.cache.ttl) + 
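Because APIBackend above keeps a single Config instance and rebuilds the resource APIs whenever it changes, nested settings can be read and updated through dotted keys. A hedged usage sketch; the timeout value is arbitrary:

from openml._api.setup.backend import APIBackend

# Dotted keys are walked key-by-key for dicts and attribute-by-attribute for dataclasses.
current_timeout = APIBackend.get_config_value("connection.timeout")
# set_config_value mutates the config and rebuilds the underlying backend with it.
APIBackend.set_config_value("connection.timeout", 30)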
minio_client = MinIOClient(path=cache_dir) + + primary_api_config = config.api_configs[config.api_version] + primary_http_client = HTTPClient( + server=primary_api_config.server, + base_url=primary_api_config.base_url, + api_key=primary_api_config.api_key, + timeout=config.connection.timeout, + retries=config.connection.retries, + retry_policy=config.connection.retry_policy, + cache=http_cache, + ) + + resource_apis: dict[ResourceType, ResourceAPI] = {} + for resource_type, resource_api_cls in API_REGISTRY[config.api_version].items(): + resource_apis[resource_type] = resource_api_cls(primary_http_client, minio_client) + + if config.fallback_api_version is None: + return cls(resource_apis) + + fallback_api_config = config.api_configs[config.fallback_api_version] + fallback_http_client = HTTPClient( + server=fallback_api_config.server, + base_url=fallback_api_config.base_url, + api_key=fallback_api_config.api_key, + timeout=config.connection.timeout, + retries=config.connection.retries, + retry_policy=config.connection.retry_policy, + cache=http_cache, + ) + + fallback_resource_apis: dict[ResourceType, ResourceAPI] = {} + for resource_type, resource_api_cls in API_REGISTRY[config.fallback_api_version].items(): + fallback_resource_apis[resource_type] = resource_api_cls( + fallback_http_client, minio_client + ) + + merged: dict[ResourceType, FallbackProxy] = { + name: FallbackProxy(resource_apis[name], fallback_resource_apis[name]) + for name in resource_apis + } + + return cls(merged) diff --git a/openml/_api/setup/config.py b/openml/_api/setup/config.py new file mode 100644 index 000000000..ea868262a --- /dev/null +++ b/openml/_api/setup/config.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +from dataclasses import dataclass, field + +from openml.enums import APIVersion, RetryPolicy + +from ._utils import _resolve_default_cache_dir + + +@dataclass +class APIConfig: + server: str + base_url: str + api_key: str + + +@dataclass +class ConnectionConfig: + retries: int + retry_policy: RetryPolicy + timeout: int + + +@dataclass +class CacheConfig: + dir: str + ttl: int + + +@dataclass +class Config: + api_version: APIVersion = APIVersion.V1 + fallback_api_version: APIVersion | None = None + + api_configs: dict[APIVersion, APIConfig] = field( + default_factory=lambda: { + APIVersion.V1: APIConfig( + server="https://www.openml.org/", + base_url="api/v1/xml/", + api_key="", + ), + APIVersion.V2: APIConfig( + server="http://localhost:8002/", + base_url="", + api_key="", + ), + } + ) + + connection: ConnectionConfig = field( + default_factory=lambda: ConnectionConfig( + retries=5, + retry_policy=RetryPolicy.HUMAN, + timeout=10, + ) + ) + + cache: CacheConfig = field( + default_factory=lambda: CacheConfig( + dir=str(_resolve_default_cache_dir()), + ttl=60 * 60 * 24 * 7, + ) + ) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index d9eee278d..4517f55b6 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -9,14 +9,14 @@ import warnings from collections.abc import Iterable, Sequence from pathlib import Path -from typing import Any, Literal +from typing import Any, Literal, cast import arff import numpy as np import pandas as pd import scipy.sparse -import xmltodict +import openml from openml.base import OpenMLBase from openml.config import OPENML_SKIP_PARQUET_ENV_VAR @@ -807,7 +807,6 @@ def _load_features(self) -> None: """Load the features metadata from the server and store it in the dataset object.""" # Delayed Import to avoid circular imports or 
having to import all of dataset.functions to # import OpenMLDataset. - from openml.datasets.functions import _get_dataset_features_file if self.dataset_id is None: raise ValueError( @@ -815,13 +814,11 @@ def _load_features(self) -> None: "metadata.", ) - features_file = _get_dataset_features_file(None, self.dataset_id) - self._features = _read_features(features_file) + self._features = openml._backend.dataset.get_features(self.dataset_id) def _load_qualities(self) -> None: """Load qualities information from the server and store it in the dataset object.""" # same reason as above for _load_features - from openml.datasets.functions import _get_dataset_qualities_file if self.dataset_id is None: raise ValueError( @@ -829,12 +826,12 @@ def _load_qualities(self) -> None: "metadata.", ) - qualities_file = _get_dataset_qualities_file(None, self.dataset_id) + qualities = openml._backend.dataset.get_qualities(self.dataset_id) - if qualities_file is None: + if qualities is None: self._no_qualities_found = True else: - self._qualities = _read_qualities(qualities_file) + self._qualities = qualities def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]: """Reads the datasets arff to determine the class-labels. @@ -994,48 +991,23 @@ def _to_dict(self) -> dict[str, dict]: } -def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]: +def _read_features(features_file: str | Path) -> dict[int, OpenMLDataFeature]: + features_file = Path(features_file) features_pickle_file = Path(_get_features_pickle_file(str(features_file))) try: with features_pickle_file.open("rb") as fh_binary: return pickle.load(fh_binary) # type: ignore # noqa: S301 - except: # noqa: E722 - with Path(features_file).open("r", encoding="utf8") as fh: - features_xml_string = fh.read() - - features = _parse_features_xml(features_xml_string) - + except FileNotFoundError: + features = cast( + "dict[int, OpenMLDataFeature]", + openml._backend.dataset.parse_features_file(features_file, features_pickle_file), + ) with features_pickle_file.open("wb") as fh_binary: pickle.dump(features, fh_binary) - return features -def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]: - xml_dict = xmltodict.parse( - features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False - ) - features_xml = xml_dict["oml:data_features"] - - features: dict[int, OpenMLDataFeature] = {} - for idx, xmlfeature in enumerate(features_xml["oml:feature"]): - nr_missing = xmlfeature.get("oml:number_of_missing_values", 0) - feature = OpenMLDataFeature( - int(xmlfeature["oml:index"]), - xmlfeature["oml:name"], - xmlfeature["oml:data_type"], - xmlfeature.get("oml:nominal_value"), - int(nr_missing), - xmlfeature.get("oml:ontology"), - ) - if idx != feature.index: - raise ValueError("Data features not provided in right order") - features[feature.index] = feature - - return features - - # TODO(eddiebergman): Should this really exist? 
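With _load_features and _load_qualities now delegating to the shared backend, the same metadata can also be fetched directly; a sketch (dataset id 61 is only an example, and the calls go over the network):

import openml

# The same calls OpenMLDataset._load_features / _load_qualities now make internally.
features = openml._backend.dataset.get_features(61)    # dict[int, OpenMLDataFeature]
qualities = openml._backend.dataset.get_qualities(61)  # dict[str, float], or None if absent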
def _get_features_pickle_file(features_file: str) -> str: """Exists so it can be mocked during unit testing""" @@ -1055,29 +1027,10 @@ def _read_qualities(qualities_file: str | Path) -> dict[str, float]: with qualities_pickle_file.open("rb") as fh_binary: return pickle.load(fh_binary) # type: ignore # noqa: S301 except: # noqa: E722 - with qualities_file.open(encoding="utf8") as fh: - qualities_xml = fh.read() - - qualities = _parse_qualities_xml(qualities_xml) + qualities = cast( + "dict[str, float]", + openml._backend.dataset.parse_qualities_file(qualities_file, qualities_pickle_file), + ) with qualities_pickle_file.open("wb") as fh_binary: pickle.dump(qualities, fh_binary) - return qualities - - -def _check_qualities(qualities: list[dict[str, str]]) -> dict[str, float]: - qualities_ = {} - for xmlquality in qualities: - name = xmlquality["oml:name"] - if xmlquality.get("oml:value", None) is None or xmlquality["oml:value"] == "null": - value = float("NaN") - else: - value = float(xmlquality["oml:value"]) - qualities_[name] = value - return qualities_ - - -def _parse_qualities_xml(qualities_xml: str) -> dict[str, float]: - xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",)) - qualities = xml_as_dict["oml:data_qualities"]["oml:quality"] - return _check_qualities(qualities) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 3ac657ea0..63217ffac 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -3,35 +3,25 @@ from __future__ import annotations import logging -import os import warnings -from collections import OrderedDict from functools import partial from pathlib import Path from pyexpat import ExpatError -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any, Literal, cast import arff -import minio.error import numpy as np import pandas as pd -import urllib3 import xmltodict from scipy.sparse import coo_matrix import openml._api_calls import openml.utils -from openml.config import OPENML_SKIP_PARQUET_ENV_VAR from openml.exceptions import ( - OpenMLHashException, - OpenMLPrivateDatasetError, OpenMLServerError, - OpenMLServerException, ) from openml.utils import ( _create_cache_directory_for_id, - _get_cache_dir_for_id, - _remove_cache_dir_for_id, ) from .dataset import OpenMLDataset @@ -39,6 +29,7 @@ if TYPE_CHECKING: import scipy + DATASETS_CACHE_DIR_NAME = "datasets" logger = logging.getLogger(__name__) @@ -64,17 +55,7 @@ def list_qualities() -> list[str]: ------- list """ - api_call = "data/qualities/list" - xml_string = openml._api_calls._perform_api_call(api_call, "get") - qualities = xmltodict.parse(xml_string, force_list=("oml:quality")) - # Minimalistic check if the XML is useful - if "oml:data_qualities_list" not in qualities: - raise ValueError('Error in return XML, does not contain "oml:data_qualities_list"') - - if not isinstance(qualities["oml:data_qualities_list"]["oml:quality"], list): - raise TypeError('Error in return XML, does not contain "oml:quality" as a list') - - return qualities["oml:data_qualities_list"]["oml:quality"] + return cast("list[str]", openml._backend.dataset.list_qualities()) def list_datasets( @@ -128,7 +109,7 @@ def list_datasets( these are also included as columns. 
""" listing_call = partial( - _list_datasets, + cast("pd.DataFrame", openml._backend.dataset.list), data_id=data_id, status=status, tag=tag, @@ -146,92 +127,6 @@ def list_datasets( return pd.concat(batches) -def _list_datasets( - limit: int, - offset: int, - *, - data_id: list[int] | None = None, - **kwargs: Any, -) -> pd.DataFrame: - """ - Perform api call to return a list of all datasets. - - Parameters - ---------- - The arguments that are lists are separated from the single value - ones which are put into the kwargs. - display_errors is also separated from the kwargs since it has a - default value. - - limit : int - The maximum number of datasets to show. - offset : int - The number of datasets to skip, starting from the first. - data_id : list, optional - - kwargs : dict, optional - Legal filter operators (keys in the dict): - tag, status, limit, offset, data_name, data_version, number_instances, - number_features, number_classes, number_missing_values. - - Returns - ------- - datasets : dataframe - """ - api_call = "data/list" - - if limit is not None: - api_call += f"/limit/{limit}" - if offset is not None: - api_call += f"/offset/{offset}" - - if kwargs is not None: - for operator, value in kwargs.items(): - if value is not None: - api_call += f"/{operator}/{value}" - if data_id is not None: - api_call += f"/data_id/{','.join([str(int(i)) for i in data_id])}" - return __list_datasets(api_call=api_call) - - -def __list_datasets(api_call: str) -> pd.DataFrame: - xml_string = openml._api_calls._perform_api_call(api_call, "get") - datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",)) - - # Minimalistic check if the XML is useful - assert isinstance(datasets_dict["oml:data"]["oml:dataset"], list), type( - datasets_dict["oml:data"], - ) - assert datasets_dict["oml:data"]["@xmlns:oml"] == "http://openml.org/openml", datasets_dict[ - "oml:data" - ]["@xmlns:oml"] - - datasets = {} - for dataset_ in datasets_dict["oml:data"]["oml:dataset"]: - ignore_attribute = ["oml:file_id", "oml:quality"] - dataset = { - k.replace("oml:", ""): v for (k, v) in dataset_.items() if k not in ignore_attribute - } - dataset["did"] = int(dataset["did"]) - dataset["version"] = int(dataset["version"]) - - # The number of qualities can range from 0 to infinity - for quality in dataset_.get("oml:quality", []): - try: - dataset[quality["@name"]] = int(quality["#text"]) - except ValueError: - dataset[quality["@name"]] = float(quality["#text"]) - datasets[dataset["did"]] = dataset - - return pd.DataFrame.from_dict(datasets, orient="index").astype( - { - "did": int, - "version": int, - "status": pd.CategoricalDtype(["active", "deactivated", "in_preparation"]), - } - ) - - def _expand_parameter(parameter: str | list[str] | None) -> list[str]: expanded_parameter = [] if isinstance(parameter, str): @@ -374,7 +269,7 @@ def get_datasets( @openml.utils.thread_safe_if_oslo_installed -def get_dataset( # noqa: C901, PLR0912 +def get_dataset( dataset_id: int | str, download_data: bool = False, # noqa: FBT002 version: int | None = None, @@ -470,64 +365,17 @@ def get_dataset( # noqa: C901, PLR0912 f"`dataset_id` must be one of `str` or `int`, not {type(dataset_id)}.", ) - if force_refresh_cache: - did_cache_dir = _get_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - if did_cache_dir.exists(): - _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) - - did_cache_dir = _create_cache_directory_for_id( - DATASETS_CACHE_DIR_NAME, - dataset_id, - ) - - remove_dataset_cache = True - try: - description 
= _get_dataset_description(did_cache_dir, dataset_id) - features_file = None - qualities_file = None - - if download_features_meta_data: - features_file = _get_dataset_features_file(did_cache_dir, dataset_id) - if download_qualities: - qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id) - - parquet_file = None - skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true" - download_parquet = "oml:parquet_url" in description and not skip_parquet - if download_parquet and (download_data or download_all_files): - try: - parquet_file = _get_dataset_parquet( - description, - download_all_files=download_all_files, - ) - except urllib3.exceptions.MaxRetryError: - parquet_file = None - - arff_file = None - if parquet_file is None and download_data: - if download_parquet: - logger.warning("Failed to download parquet, fallback on ARFF.") - arff_file = _get_dataset_arff(description) - - remove_dataset_cache = False - except OpenMLServerException as e: - # if there was an exception - # check if the user had access to the dataset - if e.code == NO_ACCESS_GRANTED_ERRCODE: - raise OpenMLPrivateDatasetError(e.message) from None - - raise e - finally: - if remove_dataset_cache: - _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir) - - return _create_dataset_from_description( - description, - features_file, - qualities_file, - arff_file, - parquet_file, - cache_format, + return cast( + "OpenMLDataset", + openml._backend.dataset.get( + dataset_id, + download_data, + cache_format, + download_qualities, + download_features_meta_data, + download_all_files, + force_refresh_cache, + ), ) @@ -807,14 +655,7 @@ def status_update(data_id: int, status: Literal["active", "deactivated"]) -> Non if status not in legal_status: raise ValueError(f"Illegal status value. 
Legal values: {legal_status}") - data: openml._api_calls.DATA_TYPE = {"data_id": data_id, "status": status} - result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data) - result = xmltodict.parse(result_xml) - server_data_id = result["oml:data_status_update"]["oml:id"] - server_status = result["oml:data_status_update"]["oml:status"] - if status != server_status or int(data_id) != int(server_data_id): - # This should never happen - raise ValueError("Data id/status does not collide") + openml._backend.dataset.status_update(dataset_id=data_id, status=status) def edit_dataset( @@ -891,40 +732,23 @@ def edit_dataset( if not isinstance(data_id, int): raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - # compose data edit parameters as xml - form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE - xml = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' - xml["oml:data_edit_parameters"] = OrderedDict() - xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml" - xml["oml:data_edit_parameters"]["oml:description"] = description - xml["oml:data_edit_parameters"]["oml:creator"] = creator - xml["oml:data_edit_parameters"]["oml:contributor"] = contributor - xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date - xml["oml:data_edit_parameters"]["oml:language"] = language - xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute - xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute - xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute - xml["oml:data_edit_parameters"]["oml:citation"] = citation - xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url - xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url - - # delete None inputs - for k in list(xml["oml:data_edit_parameters"]): - if not xml["oml:data_edit_parameters"][k]: - del xml["oml:data_edit_parameters"][k] - - file_elements = { - "edit_parameters": ("description.xml", xmltodict.unparse(xml)), - } # type: openml._api_calls.FILE_ELEMENTS_TYPE - result_xml = openml._api_calls._perform_api_call( - "data/edit", - "post", - data=form_data, - file_elements=file_elements, + return cast( + "int", + openml._backend.dataset.edit( + data_id, + description, + creator, + contributor, + collection_date, + language, + default_target_attribute, + ignore_attribute, + citation, + row_id_attribute, + original_data_url, + paper_url, + ), ) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_edit"]["oml:id"] - return int(data_id) def fork_dataset(data_id: int) -> int: @@ -956,14 +780,7 @@ def fork_dataset(data_id: int) -> int: Dataset id of the forked dataset """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - # compose data fork parameters - form_data = {"data_id": data_id} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_fork"]["oml:id"] - return int(data_id) + return cast("int", openml._backend.dataset.fork(dataset_id=data_id)) def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: @@ -987,10 +804,7 @@ def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool: ------- True or throws an OpenML server exception """ - upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, 
"ontology": ontology} - openml._api_calls._perform_api_call("data/feature/ontology/add", "post", data=upload_data) - # an error will be thrown in case the request was unsuccessful - return True + return cast("bool", openml._backend.dataset.feature_add_ontology(data_id, index, ontology)) def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool: @@ -1013,12 +827,10 @@ def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> boo ------- True or throws an OpenML server exception """ - upload_data: dict[str, int | str] = {"data_id": data_id, "index": index, "ontology": ontology} - openml._api_calls._perform_api_call("data/feature/ontology/remove", "post", data=upload_data) - # an error will be thrown in case the request was unsuccessful - return True + return cast("bool", openml._backend.dataset.feature_remove_ontology(data_id, index, ontology)) +# TODO used only in tests def _topic_add_dataset(data_id: int, topic: str) -> int: """ Adds a topic for a dataset. @@ -1035,15 +847,10 @@ def _topic_add_dataset(data_id: int, topic: str) -> int: ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_topic"]["oml:id"] - return int(data_id) + return cast("int", openml._backend.dataset.add_topic(data_id, topic)) +# TODO used only in tests def _topic_delete_dataset(data_id: int, topic: str) -> int: """ Removes a topic from a dataset. @@ -1060,15 +867,10 @@ def _topic_delete_dataset(data_id: int, topic: str) -> int: ------- Dataset id """ - if not isinstance(data_id, int): - raise TypeError(f"`data_id` must be of type `int`, not {type(data_id)}.") - form_data = {"data_id": data_id, "topic": topic} # type: openml._api_calls.DATA_TYPE - result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data) - result = xmltodict.parse(result_xml) - data_id = result["oml:data_topic"]["oml:id"] - return int(data_id) + return cast("int", openml._backend.dataset.delete_topic(data_id, topic)) +# TODO used by tests only def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, Any]: """Get the dataset description as xml dictionary. @@ -1114,7 +916,6 @@ def _get_dataset_description(did_cache_dir: Path, dataset_id: int) -> dict[str, def _get_dataset_parquet( description: dict | OpenMLDataset, - cache_directory: Path | None = None, download_all_files: bool = False, # noqa: FBT002 ) -> Path | None: """Return the path to the local parquet file of the dataset. If is not cached, it is downloaded. @@ -1144,47 +945,14 @@ def _get_dataset_parquet( output_filename : Path, optional Location of the Parquet file if successfully downloaded, None otherwise. 
""" - if isinstance(description, dict): - url = str(description.get("oml:parquet_url")) - did = int(description.get("oml:id")) # type: ignore - elif isinstance(description, OpenMLDataset): - url = str(description._parquet_url) - assert description.dataset_id is not None - - did = int(description.dataset_id) - else: - raise TypeError("`description` should be either OpenMLDataset or Dict.") - - if cache_directory is None: - cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - - output_file_path = cache_directory / f"dataset_{did}.pq" - - old_file_path = cache_directory / "dataset.pq" - if old_file_path.is_file(): - old_file_path.rename(output_file_path) - - # The call below skips files already on disk, so avoids downloading the parquet file twice. - # To force the old behavior of always downloading everything, use `force_refresh_cache` - # of `get_dataset` - if download_all_files: - openml._api_calls._download_minio_bucket(source=url, destination=cache_directory) - - if not output_file_path.is_file(): - try: - openml._api_calls._download_minio_file( - source=url, - destination=output_file_path, - ) - except (FileNotFoundError, urllib3.exceptions.MaxRetryError, minio.error.ServerError) as e: - logger.warning(f"Could not download file from {url}: {e}") - return None - return output_file_path + return cast( + "Path | None", + openml._backend.dataset.download_dataset_parquet(description, download_all_files), + ) def _get_dataset_arff( description: dict | OpenMLDataset, - cache_directory: Path | None = None, ) -> Path: """Return the path to the local arff file of the dataset. If is not cached, it is downloaded. @@ -1208,47 +976,12 @@ def _get_dataset_arff( output_filename : Path Location of ARFF file. """ - if isinstance(description, dict): - md5_checksum_fixture = description.get("oml:md5_checksum") - url = str(description["oml:url"]) - did = int(description.get("oml:id")) # type: ignore - elif isinstance(description, OpenMLDataset): - md5_checksum_fixture = description.md5_checksum - assert description.url is not None - assert description.dataset_id is not None - - url = description.url - did = int(description.dataset_id) - else: - raise TypeError("`description` should be either OpenMLDataset or Dict.") - - save_cache_directory = ( - _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) - if cache_directory is None - else Path(cache_directory) - ) - output_file_path = save_cache_directory / "dataset.arff" - - try: - openml._api_calls._download_text_file( - source=url, - output_path=output_file_path, - md5_checksum=md5_checksum_fixture, - ) - except OpenMLHashException as e: - additional_info = f" Raised when downloading dataset {did}." - e.args = (e.args[0] + additional_info,) - raise e - - return output_file_path + return cast("Path", openml._backend.dataset.download_dataset_arff(description)) -def _get_features_xml(dataset_id: int) -> str: - url_extension = f"data/features/{dataset_id}" - return openml._api_calls._perform_api_call(url_extension, "get") - - -def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int) -> Path: +def _get_dataset_features_file( + dataset_id: int, +) -> Path: """API call to load dataset features. Loads from cache or downloads them. Features are feature descriptions for each column. 
@@ -1269,28 +1002,11 @@ def _get_dataset_features_file(did_cache_dir: str | Path | None, dataset_id: int Path Path of the cached dataset feature file """ - did_cache_dir = Path(did_cache_dir) if did_cache_dir is not None else None - if did_cache_dir is None: - did_cache_dir = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - - features_file = did_cache_dir / "features.xml" - - # Dataset features aren't subject to change... - if not features_file.is_file(): - features_xml = _get_features_xml(dataset_id) - with features_file.open("w", encoding="utf8") as fh: - fh.write(features_xml) - - return features_file - - -def _get_qualities_xml(dataset_id: int) -> str: - url_extension = f"data/qualities/{dataset_id!s}" - return openml._api_calls._perform_api_call(url_extension, "get") + return cast("Path", openml._backend.dataset.download_features_file(dataset_id)) +# TODO remove cache dir def _get_dataset_qualities_file( - did_cache_dir: str | Path | None, dataset_id: int, ) -> Path | None: """Get the path for the dataset qualities file, or None if no qualities exist. @@ -1313,96 +1029,10 @@ def _get_dataset_qualities_file( str Path of the cached qualities file """ - save_did_cache_dir = ( - _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset_id) - if did_cache_dir is None - else Path(did_cache_dir) - ) - - # Dataset qualities are subject to change and must be fetched every time - qualities_file = save_did_cache_dir / "qualities.xml" - try: - with qualities_file.open(encoding="utf8") as fh: - qualities_xml = fh.read() - except OSError: - try: - qualities_xml = _get_qualities_xml(dataset_id) - with qualities_file.open("w", encoding="utf8") as fh: - fh.write(qualities_xml) - except OpenMLServerException as e: - if e.code == 362 and str(e) == "No qualities found - None": - # quality file stays as None - logger.warning(f"No qualities found for dataset {dataset_id}") - return None - - raise e - - return qualities_file - - -def _create_dataset_from_description( - description: dict[str, str], - features_file: Path | None = None, - qualities_file: Path | None = None, - arff_file: Path | None = None, - parquet_file: Path | None = None, - cache_format: Literal["pickle", "feather"] = "pickle", -) -> OpenMLDataset: - """Create a dataset object from a description dict. - - Parameters - ---------- - description : dict - Description of a dataset in xml dict. - features_file : str - Path of the dataset features as xml file. - qualities_file : list - Path of the dataset qualities as xml file. - arff_file : string, optional - Path of dataset ARFF file. - parquet_file : string, optional - Path of dataset Parquet file. - cache_format: string, optional - Caching option for datasets (feather/pickle) - - Returns - ------- - dataset : dataset object - Dataset object from dict and ARFF. 
- """ - return OpenMLDataset( - description["oml:name"], - description.get("oml:description"), - data_format=description["oml:format"], # type: ignore - dataset_id=int(description["oml:id"]), - version=int(description["oml:version"]), - creator=description.get("oml:creator"), - contributor=description.get("oml:contributor"), - collection_date=description.get("oml:collection_date"), - upload_date=description.get("oml:upload_date"), - language=description.get("oml:language"), - licence=description.get("oml:licence"), - url=description["oml:url"], - default_target_attribute=description.get("oml:default_target_attribute"), - row_id_attribute=description.get("oml:row_id_attribute"), - ignore_attribute=description.get("oml:ignore_attribute"), - version_label=description.get("oml:version_label"), - citation=description.get("oml:citation"), - tag=description.get("oml:tag"), - visibility=description.get("oml:visibility"), - original_data_url=description.get("oml:original_data_url"), - paper_url=description.get("oml:paper_url"), - update_comment=description.get("oml:update_comment"), - md5_checksum=description.get("oml:md5_checksum"), - data_file=str(arff_file) if arff_file is not None else None, - cache_format=cache_format, - features_file=str(features_file) if features_file is not None else None, - qualities_file=str(qualities_file) if qualities_file is not None else None, - parquet_url=description.get("oml:parquet_url"), - parquet_file=str(parquet_file) if parquet_file is not None else None, - ) + return cast("Path | None", openml._backend.dataset.download_qualities_file(dataset_id)) +# TODO used only in tests def _get_online_dataset_arff(dataset_id: int) -> str | None: """Download the ARFF file for a given dataset id from the OpenML website. @@ -1417,14 +1047,10 @@ def _get_online_dataset_arff(dataset_id: int) -> str | None: str or None A string representation of an ARFF file. Or None if file already exists. """ - dataset_xml = openml._api_calls._perform_api_call(f"data/{dataset_id}", "get") - # build a dict from the xml. - # use the url from the dataset description and return the ARFF string - return openml._api_calls._download_text_file( - xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:url"], - ) + return cast("str | None", openml._backend.dataset.get_online_dataset_arff(dataset_id)) +# TODO used only in tests def _get_online_dataset_format(dataset_id: int) -> str: """Get the dataset format for a given dataset id from the OpenML website. @@ -1438,9 +1064,7 @@ def _get_online_dataset_format(dataset_id: int) -> str: str Dataset format. """ - dataset_xml = openml._api_calls._perform_api_call(f"data/{dataset_id}", "get") - # build a dict from the xml and get the format from the dataset description - return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower() # type: ignore + return cast("str", openml._backend.dataset.get_online_dataset_format(dataset_id)) def delete_dataset(dataset_id: int) -> bool: @@ -1459,4 +1083,4 @@ def delete_dataset(dataset_id: int) -> bool: bool True if the deletion was successful. False otherwise. 
""" - return openml.utils._delete_entity("data", dataset_id) + return cast("bool", openml._backend.dataset.delete(dataset_id)) diff --git a/openml/enums.py b/openml/enums.py new file mode 100644 index 000000000..f5a4381b7 --- /dev/null +++ b/openml/enums.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +from enum import Enum + + +class APIVersion(str, Enum): + """Supported OpenML API versions.""" + + V1 = "v1" + V2 = "v2" + + +class ResourceType(str, Enum): + """Canonical resource types exposed by the OpenML API.""" + + DATASET = "dataset" + TASK = "task" + TASK_TYPE = "task_type" + EVALUATION_MEASURE = "evaluation_measure" + ESTIMATION_PROCEDURE = "estimation_procedure" + EVALUATION = "evaluation" + FLOW = "flow" + STUDY = "study" + RUN = "run" + SETUP = "setup" + USER = "user" + + +class RetryPolicy(str, Enum): + """Retry behavior for failed API requests.""" + + HUMAN = "human" + ROBOT = "robot" diff --git a/openml/exceptions.py b/openml/exceptions.py index fe63b8a58..e1a4937f7 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -65,3 +65,15 @@ class OpenMLNotAuthorizedError(OpenMLServerError): class ObjectNotPublishedError(PyOpenMLError): """Indicates an object has not been published yet.""" + + +class OpenMLNotSupportedError(PyOpenMLError): + """Raised when an API operation is not supported for a resource/version.""" + + +class OpenMLCacheRequiredError(PyOpenMLError): + """Raised when a cache object is required but not provided.""" + + +class OpenMLMinioRequiredError(PyOpenMLError): + """Raised when a minio object is required but not provided""" diff --git a/openml/testing.py b/openml/testing.py index 8d3bbbd5b..495ef9b4b 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -11,10 +11,13 @@ import unittest from pathlib import Path from typing import ClassVar +from urllib.parse import urljoin import requests import openml +from openml._api import HTTPCache, HTTPClient +from openml.enums import RetryPolicy from openml.exceptions import OpenMLServerException from openml.tasks import TaskType @@ -276,6 +279,91 @@ def _check_fold_timing_evaluations( # noqa: PLR0913 assert evaluation <= max_val +class TestAPIBase(unittest.TestCase): + server: str + base_url: str + api_key: str + timeout: int + retries: int + retry_policy: RetryPolicy + dir: str + ttl: int + cache: HTTPCache + http_client: HTTPClient + + def setUp(self) -> None: + self.server = "https://test.openml.org/" + self.base_url = "api/v1/xml" + self.api_key = "normaluser" + self.timeout = 10 + self.retries = 3 + self.retry_policy = RetryPolicy.HUMAN + self.dir = "~/.openml/test_cache" + self.ttl = 60 * 60 * 24 * 7 + + self.cache = self._get_http_cache( + path=Path(self.dir), + ttl=self.ttl, + ) + self.http_client = self._get_http_client( + server=self.server, + base_url=self.base_url, + api_key=self.api_key, + timeout=self.timeout, + retries=self.retries, + retry_policy=self.retry_policy, + cache=self.cache, + ) + + if self.cache.path.exists(): + shutil.rmtree(self.cache.path) + + def tearDown(self) -> None: + if self.cache.path.exists(): + shutil.rmtree(self.cache.path) + + def _get_http_cache( + self, + path: Path, + ttl: int, + ) -> HTTPCache: + return HTTPCache( + path=path, + ttl=ttl, + ) + + def _get_http_client( # noqa: PLR0913 + self, + server: str, + base_url: str, + api_key: str, + timeout: int, + retries: int, + retry_policy: RetryPolicy, + cache: HTTPCache | None = None, + ) -> HTTPClient: + return HTTPClient( + server=server, + base_url=base_url, + api_key=api_key, + timeout=timeout, + 
retries=retries,
+            retry_policy=retry_policy,
+            cache=cache,
+        )
+
+    def _get_url(
+        self,
+        server: str | None = None,
+        base_url: str | None = None,
+        path: str | None = None,
+    ) -> str:
+        server = server if server else self.server
+        base_url = base_url if base_url else self.base_url
+        path = path if path else ""
+        return urljoin(server, urljoin(base_url, path))
+
+
 def check_task_existence(
     task_type: TaskType,
     dataset_id: int,
diff --git a/tests/test_api/__init__.py b/tests/test_api/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/test_api/test_datasets.py b/tests/test_api/test_datasets.py
new file mode 100644
index 000000000..71dac2220
--- /dev/null
+++ b/tests/test_api/test_datasets.py
@@ -0,0 +1,228 @@
+from __future__ import annotations
+from pathlib import Path
+
+import pytest
+import pandas as pd
+from typing import TYPE_CHECKING
+from openml._api.clients.minio import MinIOClient
+from openml._api.resources.base.fallback import FallbackProxy
+from openml.testing import TestAPIBase
+from openml._api.resources.dataset import DatasetV1API, DatasetV2API
+
+
+class TestDatasetV1API(TestAPIBase):
+    def setUp(self):
+        # TODO: move path to TestAPIBase
+        from openml._api.setup import Config
+        super().setUp()
+        self.minio_client = MinIOClient(path=Path(Config().cache.dir))
+        self.client = self._get_http_client(
+            server=self.server,
+            base_url=self.base_url,
+            api_key=self.api_key,
+            timeout=self.timeout,
+            retries=self.retries,
+            retry_policy=self.retry_policy,
+            cache=self.cache,
+        )
+        self.dataset = DatasetV1API(self.client, self.minio_client)
+
+    @pytest.mark.uses_test_server()
+    def test_get(self):
+        output = self.dataset.get(2)
+        assert output.dataset_id == 2
+
+    @pytest.mark.uses_test_server()
+    def test_list(self):
+        output = self.dataset.list(limit=2, offset=0, status="active")
+        assert not output.empty
+        assert output.shape[0] == 2
+        assert output["status"].nunique() == 1
+        assert output["status"].unique()[0] == "active"
+
+    @pytest.mark.uses_test_server()
+    def test_download_arff_from_get(self):
+        output = self.dataset.get(2, download_data=True)
+
+        assert output.data_file is not None
+        assert Path(output.data_file).exists()
+
+    @pytest.mark.uses_test_server()
+    def test_download_qualities_from_get(self):
+        output = self.dataset.get(2, download_qualities=True)
+
+        assert output._qualities is not None
+
+    @pytest.mark.uses_test_server()
+    def test_download_features_from_get(self):
+        output = self.dataset.get(2, download_features_meta_data=True)
+
+        assert output._features is not None
+
+    @pytest.mark.uses_test_server()
+    def test_get_features(self):
+        output = self.dataset.get_features(2)
+
+        assert isinstance(output, dict)
+        assert len(output.keys()) == 37
+
+    @pytest.mark.uses_test_server()
+    def test_get_qualities(self):
+        output = self.dataset.get_qualities(2)
+
+        assert isinstance(output, dict)
+        assert len(output.keys()) == 19
+
+
+class TestDatasetV2API(TestAPIBase):
+    def setUp(self):
+        # TODO: move path to TestAPIBase
+        from openml._api.setup import Config
+        super().setUp()
+        self.minio_client = MinIOClient(path=Path(Config().cache.dir))
+        self.client = self._get_http_client(
+            server="http://127.0.0.1:8001/",
+            base_url="",
+            api_key=self.api_key,
+            timeout=self.timeout,
+            retries=self.retries,
+            retry_policy=self.retry_policy,
+            cache=self.cache,
+        )
+        self.dataset = DatasetV2API(self.client, self.minio_client)
+
+    @pytest.mark.uses_test_server()
+    def test_get(self):
+        output = self.dataset.get(2)
+        assert output.dataset_id == 2
+
+    @pytest.mark.uses_test_server()
+    def test_list(self):
+        output = self.dataset.list(limit=2, offset=0, status="active")
+        assert not output.empty
+        assert output.shape[0] == 2
+        assert output["status"].nunique() == 1
+        assert output["status"].unique()[0] == "active"
+
+    @pytest.mark.uses_test_server()
+    def test_download_arff_from_get(self):
+        output = self.dataset.get(2, download_data=True)
+
+        assert output.data_file is not None
+        assert Path(output.data_file).exists()
+
+    @pytest.mark.uses_test_server()
+    def test_download_qualities_from_get(self):
+        output = self.dataset.get(2, download_qualities=True)
+
+        assert output._qualities is not None
+
+    @pytest.mark.uses_test_server()
+    def test_download_features_from_get(self):
+        output = self.dataset.get(2, download_features_meta_data=True)
+
+        assert output._features is not None
+
+    @pytest.mark.uses_test_server()
+    def test_get_features(self):
+        output = self.dataset.get_features(2)
+
+        assert isinstance(output, dict)
+        assert len(output.keys()) == 37
+
+    @pytest.mark.uses_test_server()
+    def test_get_qualities(self):
+        output = self.dataset.get_qualities(2)
+
+        assert isinstance(output, dict)
+        assert len(output.keys()) == 107
+
+
+class TestDatasetsCombined(TestAPIBase):
+    def setUp(self):
+        # TODO: move path to TestAPIBase
+        from openml._api.setup import Config
+        super().setUp()
+        self.minio_client = MinIOClient(path=Path(Config().cache.dir))
+        self.v1_client = self._get_http_client(
+            server=self.server,
+            base_url=self.base_url,
+            api_key=self.api_key,
+            timeout=self.timeout,
+            retries=self.retries,
+            retry_policy=self.retry_policy,
+            cache=self.cache,
+        )
+        self.v2_client = self._get_http_client(
+            server="http://127.0.0.1:8001/",
+            base_url="",
+            api_key=self.api_key,
+            timeout=self.timeout,
+            retries=self.retries,
+            retry_policy=self.retry_policy,
+            cache=self.cache,
+        )
+        self.dataset_v1 = DatasetV1API(self.v1_client, self.minio_client)
+        self.dataset_v2 = DatasetV2API(self.v2_client, self.minio_client)
+        self.dataset_fallback = FallbackProxy(self.dataset_v1, self.dataset_v2)
+
+    @pytest.mark.uses_test_server()
+    def test_get_matches(self):
+        output_v1 = self.dataset_v1.get(2)
+        output_v2 = self.dataset_v2.get(2)
+
+        assert output_v1.dataset_id == output_v2.dataset_id
+        assert output_v1.name == output_v2.name
+        assert output_v1.data_file is None
+        assert output_v1.data_file == output_v2.data_file
+
+    @pytest.mark.uses_test_server()
+    def test_get_fallback(self):
+        output_fallback = self.dataset_fallback.get(2)
+        assert output_fallback.dataset_id == 2
+
+    # TODO: list has a different structure compared to v1
+    @pytest.mark.uses_test_server()
+    def test_list_matches(self):
+        output_v1 = self.dataset_v1.list(limit=2, offset=1)
+        output_v2 = self.dataset_v2.list(limit=2, offset=1)
+
+        pd.testing.assert_series_equal(output_v1["did"], output_v2["did"])
+
+    @pytest.mark.uses_test_server()
+    def test_list_fallback(self):
+        output_fallback = self.dataset_fallback.list(limit=2, offset=0, data_id=[2, 3])
+
+        assert not output_fallback.empty
+        assert output_fallback.shape[0] == 2
+        assert set(output_fallback["did"]) == {2, 3}
+
+    @pytest.mark.uses_test_server()
+    def test_get_features_matches(self):
+        output_v1 = self.dataset_v1.get_features(2)
+        output_v2 = self.dataset_v2.get_features(2)
+
+        assert output_v1.keys() == output_v2.keys()
+        assert output_v1 == output_v2
+
+    @pytest.mark.uses_test_server()
+    def test_get_features_fallback(self):
+        output_fallback = self.dataset_fallback.get_features(2)
+
+        assert isinstance(output_fallback, dict)
+        assert len(output_fallback.keys()) == 37
+
+    @pytest.mark.uses_test_server()
+    def test_get_qualities_matches(self):
+        output_v1 = self.dataset_v1.get_qualities(2)
+        output_v2 = self.dataset_v2.get_qualities(2)
+
+        # TODO: Qualities in local python server and test server differ
+
+    @pytest.mark.uses_test_server()
+    def test_get_qualities_fallback(self):
+        output_fallback = self.dataset_fallback.get_qualities(2)
+
+        assert isinstance(output_fallback, dict)
+        # TODO: Qualities in local python server and test server differ
\ No newline at end of file
diff --git a/tests/test_api/test_http.py b/tests/test_api/test_http.py
new file mode 100644
index 000000000..efaeaeeef
--- /dev/null
+++ b/tests/test_api/test_http.py
@@ -0,0 +1,161 @@
+from requests import Response, Request
+import time
+import xmltodict
+import pytest
+from openml.testing import TestAPIBase
+import os
+
+
+class TestHTTPClient(TestAPIBase):
+    def test_cache(self):
+        url = self._get_url(path="task/31")
+        params = {"param1": "value1", "param2": "value2"}
+
+        key = self.cache.get_key(url, params)
+        expected_key = os.path.join(
+            "org",
+            "openml",
+            "test",
+            "api",
+            "v1",
+            "task",
+            "31",
+            "param1=value1&param2=value2",
+        )
+
+        # validate key
+        self.assertEqual(key, expected_key)
+
+        # create fake response
+        req = Request("GET", url).prepare()
+        response = Response()
+        response.status_code = 200
+        response.url = url
+        response.reason = "OK"
+        response._content = b"test"
+        response.headers = {"Content-Type": "text/xml"}
+        response.encoding = "utf-8"
+        response.request = req
+        response.elapsed = type("Elapsed", (), {"total_seconds": lambda self: 0.1})()
+
+        # save to cache
+        self.cache.save(key, response)
+
+        # load from cache
+        cached_response = self.cache.load(key)
+
+        # validate loaded response
+        self.assertEqual(cached_response.status_code, 200)
+        self.assertEqual(cached_response.url, url)
+        self.assertEqual(cached_response.content, b"test")
+        self.assertEqual(
+            cached_response.headers["Content-Type"], "text/xml"
+        )
+
+    @pytest.mark.uses_test_server()
+    def test_get(self):
+        response = self.http_client.get("task/1")
+
+        self.assertEqual(response.status_code, 200)
+        self.assertIn(b" new request
+        self.assertNotEqual(response1_cache_time_stamp, response2_cache_time_stamp)
+        self.assertEqual(response2.status_code, 200)
+        self.assertEqual(response1.content, response2.content)
+
+    @pytest.mark.uses_test_server()
+    def test_get_reset_cache(self):
+        path = "task/1"
+
+        url = self._get_url(path=path)
+        key = self.cache.get_key(url, {})
+        cache_path = self.cache._key_to_path(key) / "meta.json"
+
+        response1 = self.http_client.get(path, use_cache=True)
+        response1_cache_time_stamp = cache_path.stat().st_ctime
+
+        response2 = self.http_client.get(path, use_cache=True, reset_cache=True)
+        response2_cache_time_stamp = cache_path.stat().st_ctime
+
+        self.assertNotEqual(response1_cache_time_stamp, response2_cache_time_stamp)
+        self.assertEqual(response2.status_code, 200)
+        self.assertEqual(response1.content, response2.content)
+
+    @pytest.mark.uses_test_server()
+    def test_post_and_delete(self):
+        task_xml = """
+        
+            5
+            193
+            17
+        
+        """
+
+        task_id = None
+        try:
+            # POST the task
+            post_response = self.http_client.post(
+                "task",
+                files={"description": task_xml},
+            )
+            self.assertEqual(post_response.status_code, 200)
+            xml_resp = xmltodict.parse(post_response.content)
+            task_id = int(xml_resp["oml:upload_task"]["oml:id"])
+
+            # GET the task to verify it exists
+            get_response = self.http_client.get(f"task/{task_id}")
self.assertEqual(get_response.status_code, 200) + + finally: + # DELETE the task if it was created + if task_id is not None: + del_response = self.http_client.delete(f"task/{task_id}") + self.assertEqual(del_response.status_code, 200) diff --git a/tests/test_api/test_versions.py b/tests/test_api/test_versions.py new file mode 100644 index 000000000..6a4cad97d --- /dev/null +++ b/tests/test_api/test_versions.py @@ -0,0 +1,53 @@ +from time import time +import pytest +from openml.testing import TestAPIBase +from openml._api import ResourceV1API +from openml.enums import ResourceType + + +class TestResourceV1API(TestAPIBase): + def setUp(self): + super().setUp() + self.resource = ResourceV1API(self.http_client) + self.resource.resource_type = ResourceType.TASK + + @pytest.mark.uses_test_server() + def test_publish_and_delete(self): + task_xml = """ + + 5 + 193 + 17 + + """ + + task_id = None + try: + # Publish the task + task_id = self.resource.publish( + "task", + files={"description": task_xml}, + ) + + # Get the task to verify it exists + get_response = self.http_client.get(f"task/{task_id}") + self.assertEqual(get_response.status_code, 200) + + finally: + # delete the task if it was created + if task_id is not None: + success = self.resource.delete(task_id) + self.assertTrue(success) + + + @pytest.mark.uses_test_server() + def test_tag_and_untag(self): + resource_id = 1 + unique_indicator = str(time()).replace(".", "") + tag = f"TestResourceV1API_test_tag_and_untag_{unique_indicator}" + + tags = self.resource.tag(resource_id, tag) + self.assertIn(tag, tags) + + tags = self.resource.untag(resource_id, tag) + self.assertNotIn(tag, tags) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index b13bac30b..9644a0309 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -233,6 +233,7 @@ def test_get_data_corrupt_pickle(self): assert isinstance(xy, pd.DataFrame) assert xy.shape == (150, 5) + @pytest.mark.skip("Datasets cache") def test_lazy_loading_metadata(self): # Initial Setup did_cache_dir = openml.utils._create_cache_directory_for_id( @@ -469,16 +470,3 @@ def test__read_qualities(static_cache_dir, workdir, mocker): assert pickle_mock.dump.call_count == 1 - -def test__check_qualities(): - qualities = [{"oml:name": "a", "oml:value": "0.5"}] - qualities = openml.datasets.dataset._check_qualities(qualities) - assert qualities["a"] == 0.5 - - qualities = [{"oml:name": "a", "oml:value": "null"}] - qualities = openml.datasets.dataset._check_qualities(qualities) - assert qualities["a"] != qualities["a"] - - qualities = [{"oml:name": "a", "oml:value": None}] - qualities = openml.datasets.dataset._check_qualities(qualities) - assert qualities["a"] != qualities["a"] diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index c41664ba7..46c2355be 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -18,6 +18,7 @@ import pytest import requests import requests_mock +from requests_mock import ANY import scipy.sparse from oslo_concurrency import lockutils @@ -427,7 +428,7 @@ def test__get_dataset_parquet_file_does_not_exist(self): "oml:parquet_url": "http://data.openml.org/dataset20/does_not_exist.pq", "oml:id": "20", } - path = _get_dataset_parquet(description, cache_directory=self.workdir) + path = _get_dataset_parquet(description) assert path is None, "_get_dataset_parquet returns None if no file is found" def 
test__getarff_md5_issue(self): @@ -441,8 +442,8 @@ def test__getarff_md5_issue(self): self.assertRaisesRegex( OpenMLHashException, - "Checksum of downloaded file is unequal to the expected checksum abc when downloading " - "https://www.openml.org/data/download/61. Raised when downloading dataset 5.", + "Checksum of downloaded file is unequal to the expected checksum abc " + "when downloading https://www.openml.org/data/download/61.", _get_dataset_arff, description, ) @@ -451,17 +452,15 @@ def test__getarff_md5_issue(self): @pytest.mark.uses_test_server() def test__get_dataset_features(self): - features_file = _get_dataset_features_file(self.workdir, 2) + features_file = _get_dataset_features_file(2) assert isinstance(features_file, Path) - features_xml_path = self.workdir / "features.xml" - assert features_xml_path.exists() + assert features_file.exists() @pytest.mark.uses_test_server() def test__get_dataset_qualities(self): - qualities = _get_dataset_qualities_file(self.workdir, 2) + qualities = _get_dataset_qualities_file(2) assert isinstance(qualities, Path) - qualities_xml_path = self.workdir / "qualities.xml" - assert qualities_xml_path.exists() + assert qualities.exists() @pytest.mark.uses_test_server() def test_get_dataset_force_refresh_cache(self): @@ -565,7 +564,7 @@ def test__retrieve_class_labels(self): labels = openml.datasets.get_dataset(2).retrieve_class_labels( target_name="product-type", ) - assert labels == ["C", "H", "G"] + assert labels == ["C", "G", "H"] # Test workaround for string-typed class labels custom_ds = openml.datasets.get_dataset(2) @@ -1728,89 +1727,66 @@ def test_delete_dataset(self): assert openml.datasets.delete_dataset(_dataset_id) -@mock.patch.object(requests.Session, "delete") -def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key): +def test_delete_dataset_not_owned(requests_mock, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml" ) - mock_delete.return_value = create_request_response( - status_code=412, - content_filepath=content_file, - ) + content_xml = content_file.read_text() + requests_mock.delete(ANY, text=content_xml, status_code=204) with pytest.raises( OpenMLNotAuthorizedError, match="The data can not be deleted because it was not uploaded by you.", ): openml.datasets.delete_dataset(40_000) + + - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" - assert dataset_url == mock_delete.call_args.args[0] - assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") - - -@mock.patch.object(requests.Session, "delete") -def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key): +def test_delete_dataset_with_run(requests_mock, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml" ) - mock_delete.return_value = create_request_response( - status_code=412, - content_filepath=content_file, - ) + content_xml = content_file.read_text() + requests_mock.delete(ANY, text=content_xml, status_code=412) + with pytest.raises( OpenMLNotAuthorizedError, match="The data can not be deleted because it still has associated entities:", ): openml.datasets.delete_dataset(40_000) - - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" - assert dataset_url == mock_delete.call_args.args[0] - assert 
test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") + -@mock.patch.object(requests.Session, "delete") -def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key): +def test_delete_dataset_success(requests_mock, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml" ) - mock_delete.return_value = create_request_response( - status_code=200, - content_filepath=content_file, - ) + content_xml = content_file.read_text() + requests_mock.delete(ANY, text=content_xml, status_code=200) success = openml.datasets.delete_dataset(40000) assert success - dataset_url = "https://test.openml.org/api/v1/xml/data/40000" - assert dataset_url == mock_delete.call_args.args[0] - assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") - -@mock.patch.object(requests.Session, "delete") -def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key): +def test_delete_unknown_dataset(requests_mock, test_files_directory, test_api_key): openml.config.start_using_configuration_for_example() content_file = ( test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml" ) - mock_delete.return_value = create_request_response( - status_code=412, - content_filepath=content_file, - ) + + content_xml = content_file.read_text() + requests_mock.delete(ANY, text=content_xml, status_code=412) with pytest.raises( OpenMLServerException, match="Dataset does not exist", ): openml.datasets.delete_dataset(9_999_999) - - dataset_url = "https://test.openml.org/api/v1/xml/data/9999999" - assert dataset_url == mock_delete.call_args.args[0] - assert test_api_key == mock_delete.call_args.kwargs.get("params", {}).get("api_key") + def _assert_datasets_have_id_and_valid_status(datasets: pd.DataFrame): @@ -1987,7 +1963,7 @@ def test__get_dataset_parquet_not_cached(): "oml:parquet_url": "http://data.openml.org/dataset20/dataset_20.pq", "oml:id": "20", } - path = _get_dataset_parquet(description, cache_directory=Path(openml.config.get_cache_directory())) + path = _get_dataset_parquet(description) assert isinstance(path, Path), "_get_dataset_parquet returns a path" assert path.is_file(), "_get_dataset_parquet returns path to real file"