diff --git a/openml/__init__.py b/openml/__init__.py index ae5db261f..a7c95dc2e 100644 --- a/openml/__init__.py +++ b/openml/__init__.py @@ -33,6 +33,7 @@ utils, ) from .__version__ import __version__ +from ._api.runtime.instance import _backend from .datasets import OpenMLDataFeature, OpenMLDataset from .evaluations import OpenMLEvaluation from .flows import OpenMLFlow @@ -109,6 +110,7 @@ def populate_cache( "OpenMLTask", "__version__", "_api_calls", + "_backend", "config", "datasets", "evaluations", diff --git a/openml/_api/__init__.py b/openml/_api/__init__.py new file mode 100644 index 000000000..68b419d92 --- /dev/null +++ b/openml/_api/__init__.py @@ -0,0 +1,24 @@ +"""OpenML API module.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from openml._api.runtime.instance import _backend as backend + +if TYPE_CHECKING: + from openml._api.runtime.core import APIBackend + +__all__ = ["api_context"] + + +class APIContext: + """API context for accessing the OpenML backend.""" + + @property + def backend(self) -> APIBackend: + """Get the API backend instance.""" + return backend + + +api_context = APIContext() diff --git a/openml/_api/clients/__init__.py b/openml/_api/clients/__init__.py new file mode 100644 index 000000000..42f11fbcf --- /dev/null +++ b/openml/_api/clients/__init__.py @@ -0,0 +1,8 @@ +from .http import HTTPCache, HTTPClient +from .minio import MinIOClient + +__all__ = [ + "HTTPCache", + "HTTPClient", + "MinIOClient", +] diff --git a/openml/_api/clients/http.py b/openml/_api/clients/http.py new file mode 100644 index 000000000..dfcdf5a8a --- /dev/null +++ b/openml/_api/clients/http.py @@ -0,0 +1,430 @@ +from __future__ import annotations + +import json +import logging +import math +import random +import time +import xml +from collections.abc import Mapping +from pathlib import Path +from typing import Any +from urllib.parse import urlencode, urljoin, urlparse + +import requests +import xmltodict +from requests import 
Response + +from openml.__version__ import __version__ +from openml._api.config import RetryPolicy +from openml.exceptions import ( + OpenMLNotAuthorizedError, + OpenMLServerError, + OpenMLServerException, + OpenMLServerNoResult, +) + + +class HTTPCache: + def __init__(self, *, path: Path, ttl: int) -> None: + self.path = path + self.ttl = ttl + + def get_key(self, url: str, params: dict[str, Any]) -> str: + parsed_url = urlparse(url) + netloc_parts = parsed_url.netloc.split(".")[::-1] + path_parts = parsed_url.path.strip("/").split("/") + + filtered_params = {k: v for k, v in params.items() if k != "api_key"} + params_part = [urlencode(filtered_params)] if filtered_params else [] + + return str(Path(*netloc_parts, *path_parts, *params_part)) + + def _key_to_path(self, key: str) -> Path: + return self.path.joinpath(key) + + def load(self, key: str) -> Response: + path = self._key_to_path(key) + + if not path.exists(): + raise FileNotFoundError(f"Cache directory not found: {path}") + + meta_path = path / "meta.json" + headers_path = path / "headers.json" + body_path = path / "body.bin" + + if not (meta_path.exists() and headers_path.exists() and body_path.exists()): + raise FileNotFoundError(f"Incomplete cache at {path}") + + with meta_path.open("r", encoding="utf-8") as f: + meta = json.load(f) + + created_at = meta.get("created_at") + if created_at is None: + raise ValueError("Cache metadata missing 'created_at'") + + if time.time() - created_at > self.ttl: + raise TimeoutError(f"Cache expired for {path}") + + with headers_path.open("r", encoding="utf-8") as f: + headers = json.load(f) + + body = body_path.read_bytes() + + response = Response() + response.status_code = meta["status_code"] + response.url = meta["url"] + response.reason = meta["reason"] + response.headers = headers + response._content = body + response.encoding = meta["encoding"] + + return response + + def save(self, key: str, response: Response) -> None: + path = self._key_to_path(key) + 
path.mkdir(parents=True, exist_ok=True) + + (path / "body.bin").write_bytes(response.content) + + with (path / "headers.json").open("w", encoding="utf-8") as f: + json.dump(dict(response.headers), f) + + meta = { + "status_code": response.status_code, + "url": response.url, + "reason": response.reason, + "encoding": response.encoding, + "elapsed": response.elapsed.total_seconds(), + "created_at": time.time(), + "request": { + "method": response.request.method if response.request else None, + "url": response.request.url if response.request else None, + "headers": dict(response.request.headers) if response.request else None, + "body": response.request.body if response.request else None, + }, + } + + with (path / "meta.json").open("w", encoding="utf-8") as f: + json.dump(meta, f) + + +class HTTPClient: + def __init__( # noqa: PLR0913 + self, + *, + server: str, + base_url: str, + api_key: str, + timeout: int, + retries: int, + retry_policy: RetryPolicy, + cache: HTTPCache | None = None, + ) -> None: + self.server = server + self.base_url = base_url + self.api_key = api_key + self.timeout = timeout + self.retries = retries + self.retry_policy = retry_policy + self.cache = cache + + self.retry_func = ( + self._human_delay if retry_policy == RetryPolicy.HUMAN else self._robot_delay + ) + self.headers: dict[str, str] = {"user-agent": f"openml-python/{__version__}"} + + def _robot_delay(self, n: int) -> float: + wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60 + variation = random.gauss(0, wait / 10) + return max(1.0, wait + variation) + + def _human_delay(self, n: int) -> float: + return max(1.0, n) + + def _parse_exception_response( + self, + response: Response, + ) -> tuple[int | None, str]: + content_type = response.headers.get("Content-Type", "").lower() + + if "json" in content_type: + server_exception = response.json() + server_error = server_exception["detail"] + code = server_error.get("code") + message = server_error.get("message") + additional_information = 
server_error.get("additional_information") + else: + server_exception = xmltodict.parse(response.text) + server_error = server_exception["oml:error"] + code = server_error.get("oml:code") + message = server_error.get("oml:message") + additional_information = server_error.get("oml:additional_information") + + if code is not None: + code = int(code) + + if message and additional_information: + full_message = f"{message} - {additional_information}" + elif message: + full_message = message + elif additional_information: + full_message = additional_information + else: + full_message = "" + + return code, full_message + + def _raise_code_specific_error( + self, + code: int, + message: str, + url: str, + files: Mapping[str, Any] | None, + ) -> None: + if code in [111, 372, 512, 500, 482, 542, 674]: + # 512 for runs, 372 for datasets, 500 for flows + # 482 for tasks, 542 for evaluations, 674 for setups + # 111 for dataset descriptions + raise OpenMLServerNoResult(code=code, message=message, url=url) + + # 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow) + if code in [163] and files is not None and "description" in files: + # file_elements['description'] is the XML file description of the flow + message = f"\n{files['description']}\n{message}" + + if code in [ + 102, # flow/exists post + 137, # dataset post + 350, # dataset/42 delete + 310, # flow/ post + 320, # flow/42 delete + 400, # run/42 delete + 460, # task/42 delete + ]: + raise OpenMLNotAuthorizedError( + message=( + f"The API call {url} requires authentication via an API key.\nPlease configure " + "OpenML-Python to use your API as described in this example:" + "\nhttps://openml.github.io/openml-python/latest/examples/Basics/introduction_tutorial/#authentication" + ) + ) + + # Propagate all server errors to the calling functions, except + # for 107 which represents a database connection error. 
+        # These are typically caused by high server load,
+        # which means trying again might resolve the issue.
+        # DATABASE_CONNECTION_ERRCODE
+        if code != 107:
+            raise OpenMLServerException(code=code, message=message, url=url)
+
+    def _validate_response(
+        self,
+        method: str,
+        url: str,
+        files: Mapping[str, Any] | None,
+        response: Response,
+    ) -> Exception | None:
+        if (
+            "Content-Encoding" not in response.headers
+            or response.headers["Content-Encoding"] != "gzip"
+        ):
+            logging.warning(f"Received uncompressed content from OpenML for {url}.")
+
+        if response.status_code == 200:
+            return None
+
+        if response.status_code == requests.codes.URI_TOO_LONG:
+            raise OpenMLServerError(f"URI too long! ({url})")
+
+        retry_raise_e: Exception | None = None
+
+        try:
+            code, message = self._parse_exception_response(response)
+
+        except (requests.exceptions.JSONDecodeError, xml.parsers.expat.ExpatError) as e:
+            if method != "GET":
+                extra = f"Status code: {response.status_code}\n{response.text}"
+                raise OpenMLServerError(
+                    f"Unexpected server error when calling {url}. Please contact the "
+                    f"developers!\n{extra}"
+                ) from e
+
+            # NOTE(review): bind code/message here too, otherwise the
+            # `if code is not None` check below hits a NameError on this path.
+            code, message, retry_raise_e = None, "", e
+
+        except Exception as e:
+            # If we failed to parse it out,
+            # then something has gone wrong in the body we have sent back
+            # from the server and there is little extra information we can capture.
+            raise OpenMLServerError(
+                f"Unexpected server error when calling {url}. 
Please contact the developers!\n" + f"Status code: {response.status_code}\n{response.text}", + ) from e + + if code is not None: + self._raise_code_specific_error( + code=code, + message=message, + url=url, + files=files, + ) + + if retry_raise_e is None: + retry_raise_e = OpenMLServerException(code=code, message=message, url=url) + + return retry_raise_e + + def _request( # noqa: PLR0913 + self, + method: str, + url: str, + params: Mapping[str, Any], + data: Mapping[str, Any], + headers: Mapping[str, str], + timeout: float | int, + files: Mapping[str, Any] | None, + **request_kwargs: Any, + ) -> tuple[Response | None, Exception | None]: + retry_raise_e: Exception | None = None + response: Response | None = None + + try: + response = requests.request( + method=method, + url=url, + params=params, + data=data, + headers=headers, + timeout=timeout, + files=files, + **request_kwargs, + ) + except ( + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ConnectionError, + requests.exceptions.SSLError, + ) as e: + retry_raise_e = e + + if response is not None: + retry_raise_e = self._validate_response( + method=method, + url=url, + files=files, + response=response, + ) + + return response, retry_raise_e + + def request( + self, + method: str, + path: str, + *, + use_cache: bool = False, + reset_cache: bool = False, + use_api_key: bool = False, + **request_kwargs: Any, + ) -> Response: + url = urljoin(self.server, urljoin(self.base_url, path)) + retries = max(1, self.retries) + + params = request_kwargs.pop("params", {}).copy() + data = request_kwargs.pop("data", {}).copy() + + if use_api_key: + params["api_key"] = self.api_key + + if method.upper() in {"POST", "PUT", "PATCH"}: + data = {**params, **data} + params = {} + + # prepare headers + headers = request_kwargs.pop("headers", {}).copy() + headers.update(self.headers) + + timeout = request_kwargs.pop("timeout", self.timeout) + files = request_kwargs.pop("files", None) + + if use_cache and not reset_cache 
and self.cache is not None: + cache_key = self.cache.get_key(url, params) + try: + return self.cache.load(cache_key) + except (FileNotFoundError, TimeoutError): + pass # cache miss or expired, continue + except Exception: + raise # propagate unexpected cache errors + + for retry_counter in range(1, retries + 1): + response, retry_raise_e = self._request( + method=method, + url=url, + params=params, + data=data, + headers=headers, + timeout=timeout, + files=files, + **request_kwargs, + ) + + # executed successfully + if retry_raise_e is None: + break + # tries completed + if retry_counter >= retries: + raise retry_raise_e + + delay = self.retry_func(retry_counter) + time.sleep(delay) + + assert response is not None + + if use_cache and self.cache is not None: + cache_key = self.cache.get_key(url, params) + self.cache.save(cache_key, response) + + return response + + def get( + self, + path: str, + *, + use_cache: bool = False, + reset_cache: bool = False, + use_api_key: bool = False, + **request_kwargs: Any, + ) -> Response: + return self.request( + method="GET", + path=path, + use_cache=use_cache, + reset_cache=reset_cache, + use_api_key=use_api_key, + **request_kwargs, + ) + + def post( + self, + path: str, + **request_kwargs: Any, + ) -> Response: + return self.request( + method="POST", + path=path, + use_cache=False, + use_api_key=True, + **request_kwargs, + ) + + def delete( + self, + path: str, + **request_kwargs: Any, + ) -> Response: + return self.request( + method="DELETE", + path=path, + use_cache=False, + use_api_key=True, + **request_kwargs, + ) diff --git a/openml/_api/clients/minio.py b/openml/_api/clients/minio.py new file mode 100644 index 000000000..2edc8269b --- /dev/null +++ b/openml/_api/clients/minio.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from pathlib import Path + +from openml.__version__ import __version__ + + +class MinIOClient: + def __init__(self, path: Path | None = None) -> None: + self.path = path + self.headers: 
dict[str, str] = {"user-agent": f"openml-python/{__version__}"} diff --git a/openml/_api/config.py b/openml/_api/config.py new file mode 100644 index 000000000..3afbf224f --- /dev/null +++ b/openml/_api/config.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum + + +class APIVersion(str, Enum): + V1 = "v1" + V2 = "v2" + + +class ResourceType(str, Enum): + DATASET = "dataset" + TASK = "task" + TASK_TYPE = "task_type" + EVALUATION_MEASURE = "evaluation_measure" + ESTIMATION_PROCEDURE = "estimation_procedure" + EVALUATION = "evaluation" + FLOW = "flow" + STUDY = "study" + RUN = "run" + SETUP = "setup" + USER = "user" + + +class RetryPolicy(str, Enum): + HUMAN = "human" + ROBOT = "robot" + + +@dataclass +class APIConfig: + server: str + base_url: str + api_key: str + timeout: int = 10 # seconds + + +@dataclass +class ConnectionConfig: + retries: int = 3 + retry_policy: RetryPolicy = RetryPolicy.HUMAN + + +@dataclass +class CacheConfig: + dir: str = "~/.openml/cache" + ttl: int = 60 * 60 * 24 * 7 # one week + + +class Settings: + """Settings container that reads from openml.config on access.""" + + _instance: Settings | None = None + + def __init__(self) -> None: + self.api_configs: dict[str, APIConfig] = {} + self.connection = ConnectionConfig() + self.cache = CacheConfig() + self._initialized = False + + @classmethod + def get(cls) -> Settings: + """Get settings singleton, creating on first access.""" + if cls._instance is None: + cls._instance = cls() + return cls._instance + + @classmethod + def reset(cls) -> None: + """Reset the settings singleton. Useful for testing.""" + cls._instance = None + + def get_api_config(self, version: str) -> APIConfig: + """Get API config for a version, with lazy initialization from openml.config.""" + if not self._initialized: + self._init_from_legacy_config() + if version not in self.api_configs: + raise NotImplementedError( + f"API {version} is not yet available. 
" + f"Supported versions: {list(self.api_configs.keys())}" + ) + return self.api_configs[version] + + def _init_from_legacy_config(self) -> None: + """Lazy init from openml.config to avoid circular imports.""" + if self._initialized: + return + + # Import here (not at module level) to avoid circular imports. + # We read from openml.config to integrate with the existing config system + # where users set their API key, server, cache directory, etc. + # This avoids duplicating those settings with hardcoded values. + import openml.config as legacy + + server_url = legacy.server + server_base = server_url.rsplit("/api", 1)[0] + "/" if "/api" in server_url else server_url + + self.api_configs["v1"] = APIConfig( + server=server_base, + base_url="api/v1/xml/", + api_key=legacy.apikey, + ) + + # Sync connection- and cache- settings from legacy config + self.connection = ConnectionConfig( + retries=legacy.connection_n_retries, + retry_policy=RetryPolicy(legacy.retry_policy), + ) + self.cache = CacheConfig( + dir=str(legacy._root_cache_directory), + ) + + self._initialized = True diff --git a/openml/_api/resources/__init__.py b/openml/_api/resources/__init__.py new file mode 100644 index 000000000..b666c018b --- /dev/null +++ b/openml/_api/resources/__init__.py @@ -0,0 +1,35 @@ +from openml._api.resources.base.fallback import FallbackProxy +from openml._api.resources.dataset import DatasetV1API, DatasetV2API +from openml._api.resources.estimation_procedure import ( + EstimationProcedureV1API, + EstimationProcedureV2API, +) +from openml._api.resources.evaluation import EvaluationV1API, EvaluationV2API +from openml._api.resources.evaluation_measure import EvaluationMeasureV1API, EvaluationMeasureV2API +from openml._api.resources.flow import FlowV1API, FlowV2API +from openml._api.resources.run import RunV1API, RunV2API +from openml._api.resources.setup import SetupV1API, SetupV2API +from openml._api.resources.study import StudyV1API, StudyV2API +from openml._api.resources.task 
import TaskV1API, TaskV2API + +__all__ = [ + "DatasetV1API", + "DatasetV2API", + "EstimationProcedureV1API", + "EstimationProcedureV2API", + "EvaluationMeasureV1API", + "EvaluationMeasureV2API", + "EvaluationV1API", + "EvaluationV2API", + "FallbackProxy", + "FlowV1API", + "FlowV2API", + "RunV1API", + "RunV2API", + "SetupV1API", + "SetupV2API", + "StudyV1API", + "StudyV2API", + "TaskV1API", + "TaskV2API", +] diff --git a/openml/_api/resources/base/__init__.py b/openml/_api/resources/base/__init__.py new file mode 100644 index 000000000..f222a0b87 --- /dev/null +++ b/openml/_api/resources/base/__init__.py @@ -0,0 +1,30 @@ +from openml._api.resources.base.base import ResourceAPI +from openml._api.resources.base.fallback import FallbackProxy +from openml._api.resources.base.resources import ( + DatasetAPI, + EstimationProcedureAPI, + EvaluationAPI, + EvaluationMeasureAPI, + FlowAPI, + RunAPI, + SetupAPI, + StudyAPI, + TaskAPI, +) +from openml._api.resources.base.versions import ResourceV1API, ResourceV2API + +__all__ = [ + "DatasetAPI", + "EstimationProcedureAPI", + "EvaluationAPI", + "EvaluationMeasureAPI", + "FallbackProxy", + "FlowAPI", + "ResourceAPI", + "ResourceV1API", + "ResourceV2API", + "RunAPI", + "SetupAPI", + "StudyAPI", + "TaskAPI", +] diff --git a/openml/_api/resources/base/base.py b/openml/_api/resources/base/base.py new file mode 100644 index 000000000..dbe3e95ea --- /dev/null +++ b/openml/_api/resources/base/base.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, NoReturn + +from openml.exceptions import OpenMLNotSupportedError + +if TYPE_CHECKING: + from collections.abc import Mapping + from typing import Any + + from openml._api.clients import HTTPClient + from openml._api.config import APIVersion, ResourceType + + +class ResourceAPI(ABC): + api_version: APIVersion + resource_type: ResourceType + + def __init__(self, http: HTTPClient): + self._http = http + + 
@abstractmethod + def delete(self, resource_id: int) -> bool: ... + + @abstractmethod + def publish(self, path: str, files: Mapping[str, Any] | None) -> int: ... + + @abstractmethod + def tag(self, resource_id: int, tag: str) -> list[str]: ... + + @abstractmethod + def untag(self, resource_id: int, tag: str) -> list[str]: ... + + def _not_supported(self, *, method: str) -> NoReturn: + version = getattr(self.api_version, "value", "unknown") + resource = getattr(self.resource_type, "value", "unknown") + + raise OpenMLNotSupportedError( + f"{self.__class__.__name__}: " + f"{version} API does not support `{method}` " + f"for resource `{resource}`" + ) diff --git a/openml/_api/resources/base/fallback.py b/openml/_api/resources/base/fallback.py new file mode 100644 index 000000000..3919c36a9 --- /dev/null +++ b/openml/_api/resources/base/fallback.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +from collections.abc import Callable +from typing import Any + +from openml.exceptions import OpenMLNotSupportedError + + +class FallbackProxy: + def __init__(self, *api_versions: Any): + if not api_versions: + raise ValueError("At least one API version must be provided") + self._apis = api_versions + + def __getattr__(self, name: str) -> Any: + api, attr = self._find_attr(name) + if callable(attr): + return self._wrap_callable(name, api, attr) + return attr + + def _find_attr(self, name: str) -> tuple[Any, Any]: + for api in self._apis: + attr = getattr(api, name, None) + if attr is not None: + return api, attr + raise AttributeError(f"{self.__class__.__name__} has no attribute {name}") + + def _wrap_callable( + self, + name: str, + primary_api: Any, + primary_attr: Callable[..., Any], + ) -> Callable[..., Any]: + def wrapper(*args: Any, **kwargs: Any) -> Any: + try: + return primary_attr(*args, **kwargs) + except OpenMLNotSupportedError: + return self._call_fallbacks(name, primary_api, *args, **kwargs) + + return wrapper + + def _call_fallbacks( + self, + name: str, + 
skip_api: Any, + *args: Any, + **kwargs: Any, + ) -> Any: + for api in self._apis: + if api is skip_api: + continue + attr = getattr(api, name, None) + if callable(attr): + try: + return attr(*args, **kwargs) + except OpenMLNotSupportedError: + continue + raise OpenMLNotSupportedError(f"Could not fallback to any API for method: {name}") diff --git a/openml/_api/resources/base/resources.py b/openml/_api/resources/base/resources.py new file mode 100644 index 000000000..36fcaca15 --- /dev/null +++ b/openml/_api/resources/base/resources.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from openml._api.config import ResourceType +from openml._api.resources.base import ResourceAPI + +if TYPE_CHECKING: + import pandas as pd + + from openml._api.clients import HTTPClient, MinIOClient + + +class DatasetAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.DATASET + + def __init__(self, http: HTTPClient, minio: MinIOClient): + self._minio = minio + super().__init__(http) + + +class TaskAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.TASK + + +class EvaluationMeasureAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.EVALUATION_MEASURE + + +class EstimationProcedureAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.ESTIMATION_PROCEDURE + + +class EvaluationAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.EVALUATION + + +class FlowAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.FLOW + + +class StudyAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.STUDY + + def list( # noqa: PLR0913 + self, + limit: int | None = None, + offset: int | None = None, + status: str | None = None, + main_entity_type: str | None = None, + uploader: list[int] | None = None, + benchmark_suite: int | None = None, + ) -> pd.DataFrame: + """List studies from the OpenML server. + + Parameters + ---------- + limit : int, optional + Maximum number of studies to return. 
+ offset : int, optional + Number of studies to skip. + status : str, optional + Filter by status (active, in_preparation, deactivated, all). + main_entity_type : str, optional + Filter by main entity type (run, task). + uploader : list[int], optional + Filter by uploader IDs. + benchmark_suite : int, optional + Filter by benchmark suite ID. + + Returns + ------- + pd.DataFrame + DataFrame containing study information. + """ + raise NotImplementedError("Subclasses must implement list method") + + +class RunAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.RUN + + +class SetupAPI(ResourceAPI): + resource_type: ResourceType = ResourceType.SETUP diff --git a/openml/_api/resources/base/versions.py b/openml/_api/resources/base/versions.py new file mode 100644 index 000000000..88ae87a1c --- /dev/null +++ b/openml/_api/resources/base/versions.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +from collections.abc import Mapping +from typing import Any, cast + +import xmltodict + +from openml._api.config import APIVersion, ResourceType +from openml._api.resources.base import ResourceAPI +from openml.exceptions import ( + OpenMLNotAuthorizedError, + OpenMLServerError, + OpenMLServerException, +) + + +class ResourceV1API(ResourceAPI): + api_version: APIVersion = APIVersion.V1 + + def publish(self, path: str, files: Mapping[str, Any] | None) -> int: + response = self._http.post(path, files=files) + parsed_response = xmltodict.parse(response.content) + return self._extract_id_from_upload(parsed_response) + + def delete(self, resource_id: int) -> bool: + resource_type = self._get_endpoint_name() + + legal_resources = {"data", "flow", "task", "run", "study", "user"} + if resource_type not in legal_resources: + raise ValueError(f"Can't delete a {resource_type}") + + path = f"{resource_type}/{resource_id}" + try: + response = self._http.delete(path) + result = xmltodict.parse(response.content) + return f"oml:{resource_type}_delete" in result + except 
OpenMLServerException as e:
+            self._handle_delete_exception(resource_type, e)
+            raise
+
+    def tag(self, resource_id: int, tag: str) -> list[str]:
+        resource_type = self._get_endpoint_name()
+
+        legal_resources = {"data", "task", "flow", "setup", "run"}
+        if resource_type not in legal_resources:
+            raise ValueError(f"Can't tag a {resource_type}")
+
+        path = f"{resource_type}/tag"
+        data = {f"{resource_type}_id": resource_id, "tag": tag}
+        response = self._http.post(path, data=data)
+
+        main_tag = f"oml:{resource_type}_tag"
+        parsed_response = xmltodict.parse(response.content, force_list={"oml:tag"})
+        result = parsed_response[main_tag]
+        tags: list[str] = result.get("oml:tag", [])
+
+        return tags
+
+    def untag(self, resource_id: int, tag: str) -> list[str]:
+        resource_type = self._get_endpoint_name()
+
+        legal_resources = {"data", "task", "flow", "setup", "run"}
+        if resource_type not in legal_resources:
+            raise ValueError(f"Can't untag a {resource_type}")
+
+        path = f"{resource_type}/untag"
+        data = {f"{resource_type}_id": resource_id, "tag": tag}
+        response = self._http.post(path, data=data)
+
+        main_tag = f"oml:{resource_type}_untag"
+        parsed_response = xmltodict.parse(response.content, force_list={"oml:tag"})
+        result = parsed_response[main_tag]
+        tags: list[str] = result.get("oml:tag", [])
+
+        return tags
+
+    def _get_endpoint_name(self) -> str:
+        if self.resource_type == ResourceType.DATASET:
+            return "data"
+        return cast("str", self.resource_type.value)
+
+    def _handle_delete_exception(
+        self, resource_type: str, exception: OpenMLServerException
+    ) -> None:
+        # https://github.com/openml/OpenML/blob/21f6188d08ac24fcd2df06ab94cf421c946971b0/openml_OS/views/pages/api_new/v1/xml/pre.php
+        # Most exceptions are descriptive enough to be raised as their standard
+        # OpenMLServerException, however there are two cases where we add information:
+        # - a generic "failed" message, we direct them to the right issue board
+        # - when the user successfully authenticates with 
the server, + # but user is not allowed to take the requested action, + # in which case we specify a OpenMLNotAuthorizedError. + by_other_user = [323, 353, 393, 453, 594] + has_dependent_entities = [324, 326, 327, 328, 354, 454, 464, 595] + unknown_reason = [325, 355, 394, 455, 593] + if exception.code in by_other_user: + raise OpenMLNotAuthorizedError( + message=( + f"The {resource_type} can not be deleted because it was not uploaded by you." + ), + ) from exception + if exception.code in has_dependent_entities: + raise OpenMLNotAuthorizedError( + message=( + f"The {resource_type} can not be deleted because " + f"it still has associated entities: {exception.message}" + ), + ) from exception + if exception.code in unknown_reason: + raise OpenMLServerError( + message=( + f"The {resource_type} can not be deleted for unknown reason," + " please open an issue at: https://github.com/openml/openml/issues/new" + ), + ) from exception + raise exception + + def _extract_id_from_upload(self, parsed: Mapping[str, Any]) -> int: + # reads id from upload response + # actual parsed dict: {"oml:upload_flow": {"@xmlns:oml": "...", "oml:id": "42"}} + + # xmltodict always gives exactly one root key + ((_, root_value),) = parsed.items() + + if not isinstance(root_value, Mapping): + raise ValueError("Unexpected XML structure") + + # Look for oml:id directly in the root value + if "oml:id" in root_value: + id_value = root_value["oml:id"] + if isinstance(id_value, (str, int)): + return int(id_value) + + # Fallback: check all values for numeric/string IDs + for v in root_value.values(): + if isinstance(v, (str, int)): + return int(v) + + raise ValueError("No ID found in upload response") + + +class ResourceV2API(ResourceAPI): + api_version: APIVersion = APIVersion.V2 + + def publish(self, path: str, files: Mapping[str, Any] | None) -> int: # noqa: ARG002 + self._not_supported(method="publish") + + def delete(self, resource_id: int) -> bool: # noqa: ARG002 + 
self._not_supported(method="delete") + + def tag(self, resource_id: int, tag: str) -> list[str]: # noqa: ARG002 + self._not_supported(method="tag") + + def untag(self, resource_id: int, tag: str) -> list[str]: # noqa: ARG002 + self._not_supported(method="untag") diff --git a/openml/_api/resources/dataset.py b/openml/_api/resources/dataset.py new file mode 100644 index 000000000..3ecad35da --- /dev/null +++ b/openml/_api/resources/dataset.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from openml._api.resources.base import DatasetAPI, ResourceV1API, ResourceV2API + + +class DatasetV1API(ResourceV1API, DatasetAPI): + pass + + +class DatasetV2API(ResourceV2API, DatasetAPI): + pass diff --git a/openml/_api/resources/estimation_procedure.py b/openml/_api/resources/estimation_procedure.py new file mode 100644 index 000000000..d2e73cfa6 --- /dev/null +++ b/openml/_api/resources/estimation_procedure.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from openml._api.resources.base import EstimationProcedureAPI, ResourceV1API, ResourceV2API + + +class EstimationProcedureV1API(ResourceV1API, EstimationProcedureAPI): + pass + + +class EstimationProcedureV2API(ResourceV2API, EstimationProcedureAPI): + pass diff --git a/openml/_api/resources/evaluation.py b/openml/_api/resources/evaluation.py new file mode 100644 index 000000000..a0149e1e5 --- /dev/null +++ b/openml/_api/resources/evaluation.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from openml._api.resources.base import EvaluationAPI, ResourceV1API, ResourceV2API + + +class EvaluationV1API(ResourceV1API, EvaluationAPI): + pass + + +class EvaluationV2API(ResourceV2API, EvaluationAPI): + pass diff --git a/openml/_api/resources/evaluation_measure.py b/openml/_api/resources/evaluation_measure.py new file mode 100644 index 000000000..bd4318417 --- /dev/null +++ b/openml/_api/resources/evaluation_measure.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from openml._api.resources.base 
import EvaluationMeasureAPI, ResourceV1API, ResourceV2API + + +class EvaluationMeasureV1API(ResourceV1API, EvaluationMeasureAPI): + pass + + +class EvaluationMeasureV2API(ResourceV2API, EvaluationMeasureAPI): + pass diff --git a/openml/_api/resources/flow.py b/openml/_api/resources/flow.py new file mode 100644 index 000000000..3b62abd3f --- /dev/null +++ b/openml/_api/resources/flow.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from openml._api.resources.base import FlowAPI, ResourceV1API, ResourceV2API + + +class FlowV1API(ResourceV1API, FlowAPI): + pass + + +class FlowV2API(ResourceV2API, FlowAPI): + pass diff --git a/openml/_api/resources/run.py b/openml/_api/resources/run.py new file mode 100644 index 000000000..9698c59dd --- /dev/null +++ b/openml/_api/resources/run.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from openml._api.resources.base import ResourceV1API, ResourceV2API, RunAPI + + +class RunV1API(ResourceV1API, RunAPI): + pass + + +class RunV2API(ResourceV2API, RunAPI): + pass diff --git a/openml/_api/resources/setup.py b/openml/_api/resources/setup.py new file mode 100644 index 000000000..e948e1b38 --- /dev/null +++ b/openml/_api/resources/setup.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from openml._api.resources.base import ResourceV1API, ResourceV2API, SetupAPI + + +class SetupV1API(ResourceV1API, SetupAPI): + pass + + +class SetupV2API(ResourceV2API, SetupAPI): + pass diff --git a/openml/_api/resources/study.py b/openml/_api/resources/study.py new file mode 100644 index 000000000..d4153f89c --- /dev/null +++ b/openml/_api/resources/study.py @@ -0,0 +1,107 @@ +from __future__ import annotations + +import pandas as pd +import xmltodict + +from openml._api.resources.base import ResourceV1API, ResourceV2API, StudyAPI + + +class StudyV1API(ResourceV1API, StudyAPI): + def list( # noqa: PLR0913 + self, + limit: int | None = None, + offset: int | None = None, + status: str | None = None, + main_entity_type: str | 
None = None, + uploader: list[int] | None = None, + benchmark_suite: int | None = None, + ) -> pd.DataFrame: + """List studies using V1 API. + + Parameters + ---------- + limit : int, optional + Maximum number of studies to return. + offset : int, optional + Number of studies to skip. + status : str, optional + Filter by status (active, in_preparation, deactivated, all). + main_entity_type : str, optional + Filter by main entity type (run, task). + uploader : list[int], optional + Filter by uploader IDs. + benchmark_suite : int, optional + Filter by benchmark suite ID. + + Returns + ------- + pd.DataFrame + DataFrame containing study information. + """ + # Build the V1 API call string + api_call = "study/list" + + if limit is not None: + api_call += f"/limit/{limit}" + if offset is not None: + api_call += f"/offset/{offset}" + if status is not None: + api_call += f"/status/{status}" + if main_entity_type is not None: + api_call += f"/main_entity_type/{main_entity_type}" + if uploader is not None: + api_call += f"/uploader/{','.join(str(u) for u in uploader)}" + if benchmark_suite is not None: + api_call += f"/benchmark_suite/{benchmark_suite}" + + # Make the GET request + response = self._http.get(api_call) + xml_string = response.content.decode("utf-8") + + # Parse XML and convert to DataFrame + study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) + + # Minimalistic check if the XML is useful + assert isinstance(study_dict["oml:study_list"]["oml:study"], list), type( + study_dict["oml:study_list"], + ) + assert study_dict["oml:study_list"]["@xmlns:oml"] == "http://openml.org/openml", study_dict[ + "oml:study_list" + ]["@xmlns:oml"] + + studies = {} + for study_ in study_dict["oml:study_list"]["oml:study"]: + # maps from xml name to a tuple of (dict name, casting fn) + expected_fields = { + "oml:id": ("id", int), + "oml:alias": ("alias", str), + "oml:main_entity_type": ("main_entity_type", str), + "oml:benchmark_suite": ("benchmark_suite", int), + 
"oml:name": ("name", str), + "oml:status": ("status", str), + "oml:creation_date": ("creation_date", str), + "oml:creator": ("creator", int), + } + study_id = int(study_["oml:id"]) + current_study = {} + for oml_field_name, (real_field_name, cast_fn) in expected_fields.items(): + if oml_field_name in study_: + current_study[real_field_name] = cast_fn(study_[oml_field_name]) + current_study["id"] = int(current_study["id"]) + studies[study_id] = current_study + + return pd.DataFrame.from_dict(studies, orient="index") + + +class StudyV2API(ResourceV2API, StudyAPI): + def list( # noqa: PLR0913 + self, + limit: int | None = None, # noqa: ARG002 + offset: int | None = None, # noqa: ARG002 + status: str | None = None, # noqa: ARG002 + main_entity_type: str | None = None, # noqa: ARG002 + uploader: list[int] | None = None, # noqa: ARG002 + benchmark_suite: int | None = None, # noqa: ARG002 + ) -> pd.DataFrame: + """V2 API for listing studies is not yet available.""" + self._not_supported(method="list") diff --git a/openml/_api/resources/task.py b/openml/_api/resources/task.py new file mode 100644 index 000000000..a97d5f726 --- /dev/null +++ b/openml/_api/resources/task.py @@ -0,0 +1,11 @@ +from __future__ import annotations + +from openml._api.resources.base import ResourceV1API, ResourceV2API, TaskAPI + + +class TaskV1API(ResourceV1API, TaskAPI): + pass + + +class TaskV2API(ResourceV2API, TaskAPI): + pass diff --git a/openml/_api/runtime/__init__.py b/openml/_api/runtime/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/openml/_api/runtime/core.py b/openml/_api/runtime/core.py new file mode 100644 index 000000000..9c3ff70a5 --- /dev/null +++ b/openml/_api/runtime/core.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING + +from openml._api.clients import HTTPCache, HTTPClient, MinIOClient +from openml._api.config import Settings +from openml._api.resources import ( + DatasetV1API, + 
DatasetV2API, + EstimationProcedureV1API, + EstimationProcedureV2API, + EvaluationMeasureV1API, + EvaluationMeasureV2API, + EvaluationV1API, + EvaluationV2API, + FallbackProxy, + FlowV1API, + FlowV2API, + RunV1API, + RunV2API, + SetupV1API, + SetupV2API, + StudyV1API, + StudyV2API, + TaskV1API, + TaskV2API, +) + +if TYPE_CHECKING: + from openml._api.resources.base import ( + DatasetAPI, + EstimationProcedureAPI, + EvaluationAPI, + EvaluationMeasureAPI, + FlowAPI, + RunAPI, + SetupAPI, + StudyAPI, + TaskAPI, + ) + + +class APIBackend: + def __init__( # noqa: PLR0913 + self, + *, + dataset: DatasetAPI | FallbackProxy, + task: TaskAPI | FallbackProxy, + evaluation_measure: EvaluationMeasureAPI | FallbackProxy, + estimation_procedure: EstimationProcedureAPI | FallbackProxy, + evaluation: EvaluationAPI | FallbackProxy, + flow: FlowAPI | FallbackProxy, + study: StudyAPI | FallbackProxy, + run: RunAPI | FallbackProxy, + setup: SetupAPI | FallbackProxy, + ): + self.dataset = dataset + self.task = task + self.evaluation_measure = evaluation_measure + self.estimation_procedure = estimation_procedure + self.evaluation = evaluation + self.flow = flow + self.study = study + self.run = run + self.setup = setup + + @classmethod + def build(cls, version: str, *, strict: bool) -> APIBackend: + settings = Settings.get() + + # Get config for v1. On first access, this triggers lazy initialization + # from openml.config, reading the user's actual API key, server URL, + # cache directory, and retry settings. This avoids circular imports + # (openml.config is imported inside the method, not at module load time) + # and ensures we use the user's configured values rather than hardcoded defaults. 
+ v1_config = settings.get_api_config("v1") + + http_cache = HTTPCache( + path=Path(settings.cache.dir).expanduser(), + ttl=settings.cache.ttl, + ) + minio_client = MinIOClient( + path=Path(settings.cache.dir).expanduser(), + ) + + v1_http_client = HTTPClient( + server=v1_config.server, + base_url=v1_config.base_url, + api_key=v1_config.api_key, + timeout=v1_config.timeout, + retries=settings.connection.retries, + retry_policy=settings.connection.retry_policy, + cache=http_cache, + ) + v1_dataset = DatasetV1API(v1_http_client, minio_client) + v1_task = TaskV1API(v1_http_client) + v1_evaluation_measure = EvaluationMeasureV1API(v1_http_client) + v1_estimation_procedure = EstimationProcedureV1API(v1_http_client) + v1_evaluation = EvaluationV1API(v1_http_client) + v1_flow = FlowV1API(v1_http_client) + v1_study = StudyV1API(v1_http_client) + v1_run = RunV1API(v1_http_client) + v1_setup = SetupV1API(v1_http_client) + + v1 = cls( + dataset=v1_dataset, + task=v1_task, + evaluation_measure=v1_evaluation_measure, + estimation_procedure=v1_estimation_procedure, + evaluation=v1_evaluation, + flow=v1_flow, + study=v1_study, + run=v1_run, + setup=v1_setup, + ) + + if version == "v1": + return v1 + + # V2 support. Currently v2 is not yet available, + # so get_api_config("v2") raises NotImplementedError. When v2 becomes available, + # its config will be added to Settings._init_from_legacy_config(). + # In strict mode: propagate the error. + # In non-strict mode: silently fall back to v1 only. 
+ try: + v2_config = settings.get_api_config("v2") + except NotImplementedError: + if strict: + raise + # Non-strict mode: fall back to v1 only + return v1 + + v2_http_client = HTTPClient( + server=v2_config.server, + base_url=v2_config.base_url, + api_key=v2_config.api_key, + timeout=v2_config.timeout, + retries=settings.connection.retries, + retry_policy=settings.connection.retry_policy, + cache=http_cache, + ) + v2_dataset = DatasetV2API(v2_http_client, minio_client) + v2_task = TaskV2API(v2_http_client) + v2_evaluation_measure = EvaluationMeasureV2API(v2_http_client) + v2_estimation_procedure = EstimationProcedureV2API(v2_http_client) + v2_evaluation = EvaluationV2API(v2_http_client) + v2_flow = FlowV2API(v2_http_client) + v2_study = StudyV2API(v2_http_client) + v2_run = RunV2API(v2_http_client) + v2_setup = SetupV2API(v2_http_client) + + v2 = cls( + dataset=v2_dataset, + task=v2_task, + evaluation_measure=v2_evaluation_measure, + estimation_procedure=v2_estimation_procedure, + evaluation=v2_evaluation, + flow=v2_flow, + study=v2_study, + run=v2_run, + setup=v2_setup, + ) + + if strict: + return v2 + + fallback_dataset = FallbackProxy(v1_dataset, v2_dataset) + fallback_task = FallbackProxy(v1_task, v2_task) + fallback_evaluation_measure = FallbackProxy(v1_evaluation_measure, v2_evaluation_measure) + fallback_estimation_procedure = FallbackProxy( + v1_estimation_procedure, v2_estimation_procedure + ) + fallback_evaluation = FallbackProxy(v1_evaluation, v2_evaluation) + fallback_flow = FallbackProxy(v1_flow, v2_flow) + fallback_study = FallbackProxy(v1_study, v2_study) + fallback_run = FallbackProxy(v1_run, v2_run) + fallback_setup = FallbackProxy(v1_setup, v2_setup) + + return cls( + dataset=fallback_dataset, + task=fallback_task, + evaluation_measure=fallback_evaluation_measure, + estimation_procedure=fallback_estimation_procedure, + evaluation=fallback_evaluation, + flow=fallback_flow, + study=fallback_study, + run=fallback_run, + setup=fallback_setup, + ) 
diff --git a/openml/_api/runtime/instance.py b/openml/_api/runtime/instance.py new file mode 100644 index 000000000..633d3f372 --- /dev/null +++ b/openml/_api/runtime/instance.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from openml._api.runtime.core import APIBackend + +_backend: APIBackend = APIBackend.build(version="v1", strict=False) diff --git a/openml/exceptions.py b/openml/exceptions.py index fe63b8a58..26c2d2591 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -65,3 +65,7 @@ class OpenMLNotAuthorizedError(OpenMLServerError): class ObjectNotPublishedError(PyOpenMLError): """Indicates an object has not been published yet.""" + + +class OpenMLNotSupportedError(PyOpenMLError): + """Raised when an API operation is not supported for a resource/version.""" diff --git a/openml/study/functions.py b/openml/study/functions.py index bb24ddcff..755eb2293 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -3,7 +3,7 @@ import warnings from functools import partial -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import pandas as pd import xmltodict @@ -11,6 +11,7 @@ import openml._api_calls import openml.config import openml.utils +from openml._api import api_context from openml.study.study import OpenMLBenchmarkSuite, OpenMLStudy if TYPE_CHECKING: @@ -337,7 +338,8 @@ def delete_study(study_id: int) -> bool: bool True iff the deletion was successful. 
False otherwise """ - return openml.utils._delete_entity("study", study_id) + result: bool = api_context.backend.study.delete(study_id) + return result def attach_to_suite(suite_id: int, task_ids: list[int]) -> int: @@ -466,7 +468,7 @@ def list_suites( - creation_date """ listing_call = partial( - _list_studies, + api_context.backend.study.list, main_entity_type="task", status=status, uploader=uploader, @@ -482,7 +484,7 @@ def list_studies( offset: int | None = None, size: int | None = None, status: str | None = None, - uploader: list[str] | None = None, + uploader: list[int] | None = None, benchmark_suite: int | None = None, ) -> pd.DataFrame: """ @@ -517,7 +519,7 @@ def list_studies( these are also returned. """ listing_call = partial( - _list_studies, + api_context.backend.study.list, main_entity_type="run", status=status, uploader=uploader, @@ -528,81 +530,3 @@ def list_studies( return pd.DataFrame() return pd.concat(batches) - - -def _list_studies(limit: int, offset: int, **kwargs: Any) -> pd.DataFrame: - """Perform api call to return a list of studies. - - Parameters - ---------- - limit: int - The maximum number of studies to return. - offset: int - The number of studies to skip, starting from the first. - kwargs : dict, optional - Legal filter operators (keys in the dict): - status, main_entity_type, uploader, benchmark_suite - - Returns - ------- - studies : dataframe - """ - api_call = "study/list" - if limit is not None: - api_call += f"/limit/{limit}" - if offset is not None: - api_call += f"/offset/{offset}" - if kwargs is not None: - for operator, value in kwargs.items(): - if value is not None: - api_call += f"/{operator}/{value}" - return __list_studies(api_call=api_call) - - -def __list_studies(api_call: str) -> pd.DataFrame: - """Retrieves the list of OpenML studies and - returns it in a dictionary or a Pandas DataFrame. - - Parameters - ---------- - api_call : str - The API call for retrieving the list of OpenML studies. 
- - Returns - ------- - pd.DataFrame - A Pandas DataFrame of OpenML studies - """ - xml_string = openml._api_calls._perform_api_call(api_call, "get") - study_dict = xmltodict.parse(xml_string, force_list=("oml:study",)) - - # Minimalistic check if the XML is useful - assert isinstance(study_dict["oml:study_list"]["oml:study"], list), type( - study_dict["oml:study_list"], - ) - assert study_dict["oml:study_list"]["@xmlns:oml"] == "http://openml.org/openml", study_dict[ - "oml:study_list" - ]["@xmlns:oml"] - - studies = {} - for study_ in study_dict["oml:study_list"]["oml:study"]: - # maps from xml name to a tuple of (dict name, casting fn) - expected_fields = { - "oml:id": ("id", int), - "oml:alias": ("alias", str), - "oml:main_entity_type": ("main_entity_type", str), - "oml:benchmark_suite": ("benchmark_suite", int), - "oml:name": ("name", str), - "oml:status": ("status", str), - "oml:creation_date": ("creation_date", str), - "oml:creator": ("creator", int), - } - study_id = int(study_["oml:id"]) - current_study = {} - for oml_field_name, (real_field_name, cast_fn) in expected_fields.items(): - if oml_field_name in study_: - current_study[real_field_name] = cast_fn(study_[oml_field_name]) - current_study["id"] = int(current_study["id"]) - studies[study_id] = current_study - - return pd.DataFrame.from_dict(studies, orient="index") diff --git a/openml/testing.py b/openml/testing.py index 8d3bbbd5b..b0aaac9be 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -11,10 +11,13 @@ import unittest from pathlib import Path from typing import ClassVar +from urllib.parse import urljoin import requests import openml +from openml._api.clients import HTTPCache, HTTPClient +from openml._api.config import RetryPolicy from openml.exceptions import OpenMLServerException from openml.tasks import TaskType @@ -276,6 +279,91 @@ def _check_fold_timing_evaluations( # noqa: PLR0913 assert evaluation <= max_val +class TestAPIBase(unittest.TestCase): + server: str + base_url: str + 
api_key: str + timeout: int + retries: int + retry_policy: RetryPolicy + dir: str + ttl: int + cache: HTTPCache + http_client: HTTPClient + + def setUp(self) -> None: + self.server = "https://test.openml.org/" + self.base_url = "api/v1/xml" + self.api_key = "normaluser" + self.timeout = 10 + self.retries = 3 + self.retry_policy = RetryPolicy.HUMAN + self.dir = "test_cache" + self.ttl = 60 * 60 * 24 * 7 + + self.cache = self._get_http_cache( + path=Path(self.dir), + ttl=self.ttl, + ) + self.http_client = self._get_http_client( + server=self.server, + base_url=self.base_url, + api_key=self.api_key, + timeout=self.timeout, + retries=self.retries, + retry_policy=self.retry_policy, + cache=self.cache, + ) + + if self.cache.path.exists(): + shutil.rmtree(self.cache.path) + + def tearDown(self) -> None: + if self.cache.path.exists(): + shutil.rmtree(self.cache.path) + + def _get_http_cache( + self, + path: Path, + ttl: int, + ) -> HTTPCache: + return HTTPCache( + path=path, + ttl=ttl, + ) + + def _get_http_client( # noqa: PLR0913 + self, + server: str, + base_url: str, + api_key: str, + timeout: int, + retries: int, + retry_policy: RetryPolicy, + cache: HTTPCache | None = None, + ) -> HTTPClient: + return HTTPClient( + server=server, + base_url=base_url, + api_key=api_key, + timeout=timeout, + retries=retries, + retry_policy=retry_policy, + cache=cache, + ) + + def _get_url( + self, + server: str | None = None, + base_url: str | None = None, + path: str | None = None, + ) -> str: + server = server if server else self.server + base_url = base_url if base_url else self.base_url + path = path if path else "" + return urljoin(self.server, urljoin(self.base_url, path)) + + def check_task_existence( task_type: TaskType, dataset_id: int, diff --git a/tests/test_api/__init__.py b/tests/test_api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_api/test_http.py b/tests/test_api/test_http.py new file mode 100644 index 000000000..efaeaeeef --- 
/dev/null +++ b/tests/test_api/test_http.py @@ -0,0 +1,161 @@ +from requests import Response, Request +import time +import xmltodict +import pytest +from openml.testing import TestAPIBase +import os + + +class TestHTTPClient(TestAPIBase): + def test_cache(self): + url = self._get_url(path="task/31") + params = {"param1": "value1", "param2": "value2"} + + key = self.cache.get_key(url, params) + expected_key = os.path.join( + "org", + "openml", + "test", + "api", + "v1", + "task", + "31", + "param1=value1¶m2=value2", + ) + + # validate key + self.assertEqual(key, expected_key) + + # create fake response + req = Request("GET", url).prepare() + response = Response() + response.status_code = 200 + response.url = url + response.reason = "OK" + response._content = b"test" + response.headers = {"Content-Type": "text/xml"} + response.encoding = "utf-8" + response.request = req + response.elapsed = type("Elapsed", (), {"total_seconds": lambda self: 0.1})() + + # save to cache + self.cache.save(key, response) + + # load from cache + cached_response = self.cache.load(key) + + # validate loaded response + self.assertEqual(cached_response.status_code, 200) + self.assertEqual(cached_response.url, url) + self.assertEqual(cached_response.content, b"test") + self.assertEqual( + cached_response.headers["Content-Type"], "text/xml" + ) + + @pytest.mark.uses_test_server() + def test_get(self): + response = self.http_client.get("task/1") + + self.assertEqual(response.status_code, 200) + self.assertIn(b" new request + self.assertNotEqual(response1_cache_time_stamp, response2_cache_time_stamp) + self.assertEqual(response2.status_code, 200) + self.assertEqual(response1.content, response2.content) + + @pytest.mark.uses_test_server() + def test_get_reset_cache(self): + path = "task/1" + + url = self._get_url(path=path) + key = self.cache.get_key(url, {}) + cache_path = self.cache._key_to_path(key) / "meta.json" + + response1 = self.http_client.get(path, use_cache=True) + 
response1_cache_time_stamp = cache_path.stat().st_ctime + + response2 = self.http_client.get(path, use_cache=True, reset_cache=True) + response2_cache_time_stamp = cache_path.stat().st_ctime + + self.assertNotEqual(response1_cache_time_stamp, response2_cache_time_stamp) + self.assertEqual(response2.status_code, 200) + self.assertEqual(response1.content, response2.content) + + @pytest.mark.uses_test_server() + def test_post_and_delete(self): + task_xml = """ + + 5 + 193 + 17 + + """ + + task_id = None + try: + # POST the task + post_response = self.http_client.post( + "task", + files={"description": task_xml}, + ) + self.assertEqual(post_response.status_code, 200) + xml_resp = xmltodict.parse(post_response.content) + task_id = int(xml_resp["oml:upload_task"]["oml:id"]) + + # GET the task to verify it exists + get_response = self.http_client.get(f"task/{task_id}") + self.assertEqual(get_response.status_code, 200) + + finally: + # DELETE the task if it was created + if task_id is not None: + del_response = self.http_client.delete(f"task/{task_id}") + self.assertEqual(del_response.status_code, 200) diff --git a/tests/test_api/test_studies.py b/tests/test_api/test_studies.py new file mode 100644 index 000000000..3ed365222 --- /dev/null +++ b/tests/test_api/test_studies.py @@ -0,0 +1,94 @@ +# License: BSD 3-Clause +from __future__ import annotations + +import pytest + +from openml._api.resources.study import StudyV1API, StudyV2API +from openml.exceptions import OpenMLNotSupportedError +from openml.testing import TestAPIBase + + +class TestStudyV1API(TestAPIBase): + """Tests for V1 XML API implementation of studies.""" + + def setUp(self) -> None: + super().setUp() + self.api = StudyV1API(self.http_client) + + @pytest.mark.uses_test_server() + def test_list_basic(self): + """Test basic list functionality with limit and offset.""" + studies_df = self.api.list(limit=5, offset=0) + + assert studies_df is not None + assert len(studies_df) <= 5 + + expected_columns = {"id", 
"alias", "main_entity_type", "name", "status"} + assert expected_columns.issubset(set(studies_df.columns)) + + @pytest.mark.uses_test_server() + def test_list_with_filters(self): + """Test list with various filters.""" + studies_df = self.api.list( + limit=10, + offset=0, + status="active", + ) + if len(studies_df) > 0: + assert all(studies_df["status"] == "active") + + @pytest.mark.uses_test_server() + def test_list_pagination(self): + """Test pagination with offset and limit.""" + try: + page1 = self.api.list(limit=3, offset=0) + + if len(page1) >= 3: + page2 = self.api.list(limit=3, offset=3) + + if len(page2) > 0: + page1_ids = set(page1["id"]) + page2_ids = set(page2["id"]) + assert page1_ids.isdisjoint(page2_ids) + except Exception: + pytest.skip("Not enough studies on test server for pagination test") + + @pytest.mark.uses_test_server() + def test_delete(self): + """Test delete method (inherited from ResourceV1API).""" + assert hasattr(self.api, "delete") + assert callable(self.api.delete) + + +class TestStudyV2API(TestAPIBase): + """Tests for V2 API implementation of studies.""" + + def setUp(self) -> None: + super().setUp() + self.api = StudyV2API(self.http_client) + + def test_list_not_supported(self): + """Test that list raises OpenMLNotSupportedError for V2.""" + with pytest.raises(OpenMLNotSupportedError): + self.api.list(limit=10, offset=0) + + +class TestStudyCombined(TestAPIBase): + """Combined tests for study API fallback behavior.""" + + def setUp(self) -> None: + super().setUp() + self.v1_api = StudyV1API(self.http_client) + self.v2_api = StudyV2API(self.http_client) + + @pytest.mark.uses_test_server() + def test_v1_v2_compatibility(self): + """Verify V1 and V2 APIs have compatible interfaces.""" + # Both should have the same method names + assert hasattr(self.v1_api, "list") + assert hasattr(self.v2_api, "list") + + # Both should have delete, tag, untag from base + for method in ["delete", "tag", "untag", "publish"]: + assert 
hasattr(self.v1_api, method) + assert hasattr(self.v2_api, method) \ No newline at end of file diff --git a/tests/test_api/test_versions.py b/tests/test_api/test_versions.py new file mode 100644 index 000000000..2203ab6da --- /dev/null +++ b/tests/test_api/test_versions.py @@ -0,0 +1,44 @@ +import pytest +from openml.testing import TestAPIBase +from openml._api.resources.base.versions import ResourceV1API +from openml._api.config import ResourceType + + +class TestResourceV1API(TestAPIBase): + def setUp(self): + super().setUp() + self.resource = ResourceV1API(self.http_client) + self.resource.resource_type = ResourceType.TASK + + @pytest.mark.uses_test_server() + def test_publish_and_delete(self): + task_xml = """ + + 5 + 193 + 17 + + """ + + task_id = None + try: + # Publish the task + task_id = self.resource.publish( + "task", + files={"description": task_xml}, + ) + + # Get the task to verify it exists + get_response = self.http_client.get(f"task/{task_id}") + self.assertEqual(get_response.status_code, 200) + + finally: + # delete the task if it was created + if task_id is not None: + success = self.resource.delete(task_id) + self.assertTrue(success) + + + @pytest.mark.uses_test_server() + def test_tag_and_untag(self): + pass diff --git a/tests/test_studies/test_studies_migration.py b/tests/test_studies/test_studies_migration.py new file mode 100644 index 000000000..85d764755 --- /dev/null +++ b/tests/test_studies/test_studies_migration.py @@ -0,0 +1,119 @@ +# License: BSD 3-Clause +from __future__ import annotations + +from collections import OrderedDict +from typing import Any + +import pandas as pd +import pytest +import requests + +import openml +from openml._api import api_context +from openml.exceptions import OpenMLCacheException +from openml.study import OpenMLStudy +from openml.study import functions as study_functions + + +@pytest.fixture(scope="function") +def reset_api_to_v1() -> None: + """Fixture to ensure API is set to V1 for each test.""" + 
api_context.set_version("v1", strict=False) + yield + api_context.set_version("v1", strict=False) + + +@pytest.fixture(scope="function") +def api_v2() -> None: + """Fixture to set API to V2 for tests.""" + api_context.set_version("v2", strict=True) + yield + api_context.set_version("v1", strict=False) + + +def test_list_studies_v1(reset_api_to_v1) -> None: + """Test listing studies using V1 API.""" + studies_df = study_functions.list_studies() + assert isinstance(studies_df, pd.DataFrame) + assert not studies_df.empty + + +def test_study_exists_v1(reset_api_to_v1) -> None: + """Test study_exists() using V1 API.""" + # Known existing study + name = "weka.OneR" + external_version = "Weka_3.9.0_10153" + + exists = study_functions.study_exists(name, external_version) + assert exists is not False + + # Known non-existing study + name = "non.existing.study" + external_version = "0.0.1" + + exists = study_functions.study_exists(name, external_version) + assert exists is False + + +def test_get_studies_v1(reset_api_to_v1) -> None: + """Test get() method returns a valid OpenMLstudy object using V1 API.""" + # Get the study with ID 2 (weka.OneR) + study_id = 2 + study = study_functions.get_study(study_id) + + assert isinstance(study, OpenMStudy) + assert study.study_id == study_id + assert isinstance(study.name, str) + assert len(study.name) > 0 + assert isinstance(study.external_version, str) + + +def test_study_publish_v1(reset_api_to_v1) -> None: + """Test publishing a study using V1 API.""" + from openml_sklearn.extension import SklearnExtension + from sklearn.tree import DecisionTreeClassifier + + clf = DecisionTreeClassifier() + extension = SklearnExtension() + dt_study = extension.model_to_study(clf) + + # Publish the study + published_study = dt_study.publish() + + # Verify the published study has an ID + assert isinstance(published_study, OpenMLstudy) + assert getattr(published_study, "id", None) is not None + + +def test_get_studies_v2(api_v2) -> None: + """Test 
get() method returns a valid OpenMLstudy object using V2 API.""" + # Get the study with ID 2 (weka.OneR) + study_id = 2 + + # Now get the full study details + study = study_functions.get_study(study_id) + + # Verify it's an OpenMLstudy with expected attributes + assert isinstance(study, OpenMLstudy) + assert study.study_id == study_id + assert isinstance(study.name, str) + assert len(study.name) > 0 + assert isinstance(study.external_version, str) + + +def test_study_exists_v2(api_v2) -> None: + """Test study_exists() using V2 API.""" + # Known existing study + name = "weka.OneR" + external_version = "Weka_3.9.0_10153" + + exists = study_functions.study_exists(name, external_version) + assert exists != False + + # Known non-existing study + name = "non.existing.study" + external_version = "0.0.1" + + exists = study_functions.study_exists(name, external_version) + assert exists == False + \ No newline at end of file