{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Spacer Training Notebook\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2026-05-22T00:53:53.402352Z", "iopub.status.busy": "2026-05-22T00:53:53.402139Z", "iopub.status.idle": "2026-05-22T00:53:53.407402Z", "shell.execute_reply": "2026-05-22T00:53:53.406352Z" } }, "outputs": [], "source": [ "\n", "# ── Example data paths (swap these for your own files) ──────────────────────\n", "# Run `python create_example_data.py` once to generate the files below.\n", "\n", "GENE_CSV = 'data/example_genes.csv' # reference gene list (377 genes)\n", "ADATA_H5AD = 'data/example_spatial.h5ad' # 600 cells × 98 genes, spatial grid\n", "SAMPLE_CSV = 'data/example_sample.csv' # multi-sample CSV (points at h5ad above)\n", "IMMUNE_CELL = 'macrophage' # 'tcell' | 'macrophage' | 'bcell' etc.\n", "\n", "# Synthetic label layout:\n", "# T-cell infiltration → upper half (Y < 200)\n", "# Macrophage infiltration → right half (X > 190)\n", "#\n", "# To train on real data, replace the three path variables above\n", "# (GENE_CSV, ADATA_H5AD, SAMPLE_CSV) and set immune_cell, radius, and\n", "# resolution in the BagsDataset call below.\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2026-05-22T00:53:53.408889Z", "iopub.status.busy": "2026-05-22T00:53:53.408686Z", "iopub.status.idle": "2026-05-22T00:53:58.857333Z", "shell.execute_reply": "2026-05-22T00:53:58.856661Z" } }, "outputs": [], "source": [ "\n", "import os\n", "import torch\n", "import torch.nn as nn\n", "import torch.optim as optim\n", "import pandas as pd\n", "import numpy as np\n", "import scanpy as sc\n", "from torch.utils.data import DataLoader, random_split\n", "from sklearn.metrics import roc_auc_score\n", "from tqdm import tqdm\n", "from model.dataset import BagsDataset, custom_collate_fn\n", "from model.model import MIL, EarlyStopping" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { 
"execution": { "iopub.execute_input": "2026-05-22T00:53:58.860741Z", "iopub.status.busy": "2026-05-22T00:53:58.860308Z", "iopub.status.idle": "2026-05-22T00:53:58.863529Z", "shell.execute_reply": "2026-05-22T00:53:58.863097Z" } }, "outputs": [], "source": [ "\n", "def load_all_genes(reference_gene_file):\n", " all_genes = pd.read_csv(reference_gene_file)\n", " return all_genes['Gene'].values.tolist()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "# Load reference gene list\n", "all_genes = pd.read_csv(GENE_CSV) # swap GENE_CSV for 'data/human_filtered.csv' on real data\n", "all_genes\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2026-05-22T00:53:58.865286Z", "iopub.status.busy": "2026-05-22T00:53:58.865044Z", "iopub.status.idle": "2026-05-22T00:53:58.879231Z", "shell.execute_reply": "2026-05-22T00:53:58.878377Z" } }, "outputs": [ { "data": { "text/html": [ "
| \n", " | Gene | \n", "
|---|---|
| 0 | \n", "TAP2 | \n", "
| 1 | \n", "IFI6 | \n", "
| 2 | \n", "TOP2A | \n", "
| 3 | \n", "PBK | \n", "
| 4 | \n", "TPX2 | \n", "
| ... | \n", "... | \n", "
| 372 | \n", "GENE0295 | \n", "
| 373 | \n", "GENE0296 | \n", "
| 374 | \n", "GENE0297 | \n", "
| 375 | \n", "GENE0298 | \n", "
| 376 | \n", "GENE0299 | \n", "
377 rows × 1 columns
\n", "| \n", " | adata | \n", "radius | \n", "resolution | \n", "
|---|---|---|---|
| 0 | \n", "data/example_spatial.h5ad | \n", "50 | \n", "low | \n", "
| 1 | \n", "data/example_spatial.h5ad | \n", "50 | \n", "low | \n", "