{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "cafd1f2a",
      "metadata": {},
      "source": [
        "# チャンキングサービスの静的データセット例\n",
        "\n",
        "**要件**：DataRobot Python SDK、および`SAMPLE_DATA_TO_START_PROJECT`機能フラグが有効になっていること。\n",
        "\n",
        "パイプラインヘルパーでは、モジュールレベルの`DR_API_TOKEN`および`DR_ENDPOINT`（「認証」セクションのセルで設定されたもの）で初期化したDataRobotクライアントを使用します。\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "8f66a816",
      "metadata": {},
      "source": [
        "## 認証"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1285ee1d",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "# Read the DataRobot credentials from the environment; an unset variable falls back to an empty string.\n",
        "DR_API_TOKEN = os.getenv(\"DR_API_TOKEN\", \"\")\n",
        "DR_ENDPOINT = os.getenv(\"DR_ENDPOINT\", \"\")\n",
        "\n",
        "# Fail fast when either value is empty.\n",
        "if not (DR_API_TOKEN and DR_ENDPOINT):\n",
        "    raise ValueError(\"Set DR_API_TOKEN and DR_ENDPOINT in the environment or edit this cell.\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "b6087b69",
      "metadata": {},
      "source": [
        "## ライブラリのインポート"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e72ee3a3",
      "metadata": {},
      "outputs": [],
      "source": [
        "from typing import Any, Optional\n",
        "\n",
        "import datarobot as dr\n",
        "from datarobot import UseCase\n",
        "from datarobot.enums import ChunkingPartitionMethod, ChunkingStrategy\n",
        "from datarobot.models.chunking_service_v2 import ChunkDefinition, DatasetDefinition\n",
        "from datarobot.models.project import Project\n",
        "\n",
        "# Instantiate the DataRobot client with the token and endpoint defined earlier in the notebook.\n",
        "# Binding the result to `_` keeps the client object from being echoed as the cell's output.\n",
        "_ = dr.Client(endpoint=DR_ENDPOINT, token=DR_API_TOKEN)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "fc790978",
      "metadata": {},
      "source": [
        "## パイプライン関数の設定\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "eb4a47c5",
      "metadata": {},
      "outputs": [],
      "source": [
        "def add_project_to_use_case(use_case_id: str, project_id: str) -> None:\n",
        "    \"\"\"Attach an existing project to an existing use case.\n",
        "\n",
        "    The project is fetched first and the use case second; the project is then\n",
        "    added to the use case as an entity.\n",
        "\n",
        "    Args:\n",
        "        use_case_id: ID passed to ``UseCase.get``.\n",
        "        project_id: ID passed to ``dr.Project.get``.\n",
        "    \"\"\"\n",
        "    linked_project = dr.Project.get(project_id)\n",
        "    UseCase.get(use_case_id=use_case_id).add(entity=linked_project)\n",
        "\n",
        "\n",
        "def run_ssp_ai_catalog(\n",
        "    dataset_id: str,\n",
        "    target_column: str,\n",
        "    target_class: Optional[str] = None,\n",
        "    dataset_version_id: Optional[str] = None,\n",
        "    use_case_id: Optional[str] = None,\n",
        "    datetime_partition_column: Optional[str] = None,\n",
        "    chunking_partition_method: Optional[ChunkingPartitionMethod] = ChunkingPartitionMethod.RANDOM,\n",
        ") -> Project:\n",
        "    \"\"\"Create dataset + chunk definitions and a project with sample-to-start + incremental learning.\n",
        "\n",
        "    Steps, in order: create and analyze a dataset definition, create and analyze a\n",
        "    chunk definition, create a project from the dataset with\n",
        "    ``use_sample_from_dataset=True``, optionally generate datetime partitioning,\n",
        "    optionally add the project to a use case, then call ``analyze_and_model`` with\n",
        "    incremental-learning advanced options.\n",
        "\n",
        "    Args:\n",
        "        dataset_id: Dataset used for both the dataset definition and the project.\n",
        "        target_column: Modeling target; also forwarded to the chunk definition\n",
        "            when the method is STRATIFIED.\n",
        "        target_class: Forwarded to the chunk definition for STRATIFIED only.\n",
        "        dataset_version_id: Optional version forwarded to ``DatasetDefinition.create``.\n",
        "        use_case_id: When not None, the new project is added to this use case.\n",
        "        datetime_partition_column: Forwarded to the chunk definition when the method\n",
        "            is neither RANDOM nor STRATIFIED. Independently of the chunking method,\n",
        "            any non-None value also enables datetime partitioning on the project.\n",
        "        chunking_partition_method: Chunk partitioning method. ``None`` is treated\n",
        "            as RANDOM.\n",
        "\n",
        "    Returns:\n",
        "        The project on which ``analyze_and_model`` was called.\n",
        "    \"\"\"\n",
        "    # An explicit None means \"use the default\". Resolve it once so the partition\n",
        "    # arguments, the chunk definition, and the project name cannot disagree.\n",
        "    partition_method = chunking_partition_method or ChunkingPartitionMethod.RANDOM\n",
        "\n",
        "    dataset_definition = DatasetDefinition.create(dataset_id, dataset_version_id)\n",
        "    DatasetDefinition.analyze(dataset_definition.id)\n",
        "    # Re-fetch after analysis; presumably this picks up the analyzed state (TODO confirm).\n",
        "    dataset_definition = DatasetDefinition.get(dataset_definition.id)\n",
        "\n",
        "    # All three keys are always sent; only those relevant to the method carry a value.\n",
        "    partition_args: dict[str, Any] = {\n",
        "        \"target_column\": None,\n",
        "        \"target_class\": None,\n",
        "        \"datetime_partition_column\": None,\n",
        "    }\n",
        "    if partition_method == ChunkingPartitionMethod.STRATIFIED:\n",
        "        partition_args[\"target_column\"] = target_column\n",
        "        partition_args[\"target_class\"] = target_class\n",
        "    elif partition_method != ChunkingPartitionMethod.RANDOM:\n",
        "        partition_args[\"datetime_partition_column\"] = datetime_partition_column\n",
        "\n",
        "    chunk_definition = ChunkDefinition.create(\n",
        "        dataset_definition.id,\n",
        "        partition_method=partition_method,\n",
        "        chunking_strategy_type=ChunkingStrategy.ROWS,\n",
        "        **partition_args,\n",
        "    )\n",
        "    ChunkDefinition.analyze(dataset_definition.id, chunk_definition.id)\n",
        "    chunk_definition = ChunkDefinition.get(dataset_definition.id, chunk_definition.id)\n",
        "\n",
        "    partition_label = partition_method.value.capitalize()\n",
        "    project: Project = dr.Project.create_from_dataset(\n",
        "        dataset_id,\n",
        "        project_name=f\"Sample to Start Project {partition_label} - Target: {target_column}\",\n",
        "        use_sample_from_dataset=True,\n",
        "        max_wait=6000,\n",
        "    )\n",
        "\n",
        "    # Project-level partitioning stays None unless a datetime column was supplied.\n",
        "    project_partitioning_method = None\n",
        "    if datetime_partition_column is not None:\n",
        "        print(f\"Setting up datetime partitioning for project {project.id}\")\n",
        "        spec = dr.DatetimePartitioningSpecification(\n",
        "            datetime_partition_column=datetime_partition_column,\n",
        "            use_time_series=False,\n",
        "        )\n",
        "        full_part = dr.DatetimePartitioning.generate_optimized(project.id, spec, target_column)\n",
        "        project_partitioning_method = dr.helpers.partitioning_methods.DatetimePartitioningId(\n",
        "            full_part.datetime_partitioning_id, project.id\n",
        "        )\n",
        "        print(f\"Datetime partitioning set for project {project.id}\")\n",
        "\n",
        "    if use_case_id is not None:\n",
        "        add_project_to_use_case(use_case_id=use_case_id, project_id=project.id)\n",
        "\n",
        "    # Tie modeling to the chunk definition created above.\n",
        "    advanced_options = dr.helpers.AdvancedOptions(\n",
        "        incremental_learning_only_mode=True,\n",
        "        incremental_learning_on_best_model=True,\n",
        "        chunk_definition_id=chunk_definition.id,\n",
        "        incremental_learning_early_stopping_rounds=0,\n",
        "    )\n",
        "\n",
        "    project.analyze_and_model(\n",
        "        target=target_column,\n",
        "        mode=dr.enums.AUTOPILOT_MODE.QUICK,\n",
        "        partitioning_method=project_partitioning_method,\n",
        "        advanced_options=advanced_options,\n",
        "        worker_count=-1,\n",
        "        max_wait=6000,\n",
        "    )\n",
        "    return project\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "160f1b59",
      "metadata": {},
      "source": [
        "## 設定と実行\n",
        "\n",
        "変数を編集してから実行します。**STRATIFIED**には`TARGET_CLASS`が必要です。**DATE**には`DATETIME_PARTITION_COLUMN`が必要です。\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "cefb9914",
      "metadata": {},
      "outputs": [],
      "source": [
        "# --- edit these ---\n",
        "DATASET_ID = \"your-dataset-id\"\n",
        "TARGET_COLUMN = \"your_target\"\n",
        "DATASET_VERSION_ID = None\n",
        "USE_CASE_ID = None\n",
        "TARGET_CLASS = None\n",
        "DATETIME_PARTITION_COLUMN = None\n",
        "CHUNKING_PARTITION_METHOD = \"RANDOM\"  # RANDOM | STRATIFIED | DATE\n",
        "\n",
        "# Look the method up by enum member name, ignoring the case of the configured string.\n",
        "method = ChunkingPartitionMethod[CHUNKING_PARTITION_METHOD.upper()]\n",
        "\n",
        "# Check method-specific settings before the pipeline is started.\n",
        "if not DATETIME_PARTITION_COLUMN and method == ChunkingPartitionMethod.DATE:\n",
        "    raise ValueError(\"DATETIME_PARTITION_COLUMN required for DATE\")\n",
        "if not TARGET_CLASS and method == ChunkingPartitionMethod.STRATIFIED:\n",
        "    raise ValueError(\"TARGET_CLASS required for STRATIFIED\")\n",
        "\n",
        "# Run the whole pipeline and report the ID of the project it created.\n",
        "project = run_ssp_ai_catalog(\n",
        "    dataset_id=DATASET_ID,\n",
        "    target_column=TARGET_COLUMN,\n",
        "    target_class=TARGET_CLASS,\n",
        "    dataset_version_id=DATASET_VERSION_ID,\n",
        "    use_case_id=USE_CASE_ID,\n",
        "    datetime_partition_column=DATETIME_PARTITION_COLUMN,\n",
        "    chunking_partition_method=method,\n",
        ")\n",
        "print(f\"Project ID: {project.id}\")\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.8.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}