{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "66924c9f-619a-44bf-a0e2-c6e845e02184",
   "metadata": {},
   "outputs": [],
   "source": [
    "import scanpy as sc\n",
    "import anndata\n",
    "import numpy as np\n",
    "import os\n",
    "import glob\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "05acc135",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 14/14 [03:59<00:00, 17.12s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Created combined embeddings AnnData with 100648790 cells and 50 latent dimensions\n"
     ]
    }
   ],
   "source": [
    "# Default input locations; pass raw_dir / embedding_dir to the loaders to override\n",
    "RAW_DATA_DIR = \"/home/ubuntu/data/raw\"\n",
    "EMBEDDING_DIR = \".\"\n",
    "\n",
    "# Plates to combine (1-14)\n",
    "PLATE_NUMBERS = range(1, 15)\n",
    "\n",
    "# Function to load cell obs data from backed h5ad files\n",
    "def load_cell_obs(plate_num, raw_dir=RAW_DATA_DIR):\n",
    "    \"\"\"Load the obs names and the obs table of one plate.\n",
    "\n",
    "    Args:\n",
    "        plate_num: Plate number used to build the h5ad file name.\n",
    "        raw_dir: Directory that holds the per-plate h5ad files.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (cell_names, cell_obs): the obs names as a list and a copy of\n",
    "        the obs DataFrame.\n",
    "    \"\"\"\n",
    "    plate_path = os.path.join(raw_dir, f\"plate{plate_num}_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad\")\n",
    "    # Load only the obs data, using backed mode to save memory\n",
    "    adata = sc.read_h5ad(plate_path, backed='r')\n",
    "    try:\n",
    "        cell_obs = adata.obs.copy()\n",
    "        cell_names = adata.obs_names.tolist()\n",
    "    finally:\n",
    "        # Close the backing file so open handles do not pile up across plates\n",
    "        adata.file.close()\n",
    "    return cell_names, cell_obs\n",
    "\n",
    "# Function to load embeddings from numpy files\n",
    "def load_embeddings(plate_num, embedding_dir=EMBEDDING_DIR):\n",
    "    \"\"\"Load the embedding array saved for one plate.\n",
    "\n",
    "    Args:\n",
    "        plate_num: Plate number used to build the .npy file name.\n",
    "        embedding_dir: Directory that holds the per-plate .npy files.\n",
    "\n",
    "    Returns:\n",
    "        The array stored in plate{plate_num}_embeddings.npy.\n",
    "    \"\"\"\n",
    "    embedding_path = os.path.join(embedding_dir, f'plate{plate_num}_embeddings.npy')\n",
    "    return np.load(embedding_path)\n",
    "\n",
    "# Load all cell obs data and embeddings\n",
    "all_cell_names = []\n",
    "all_cell_obs = []\n",
    "all_embeddings = []\n",
    "\n",
    "# Process each plate (1-14)\n",
    "for i in tqdm(PLATE_NUMBERS):\n",
    "    cell_names, cell_obs = load_cell_obs(i)\n",
    "    embeddings = load_embeddings(i)\n",
    "\n",
    "    # Ensure the number of cells matches the number of embeddings\n",
    "    assert len(cell_names) == embeddings.shape[0], f\"Mismatch in plate {i}: {len(cell_names)} cells vs {embeddings.shape[0]} embeddings\"\n",
    "\n",
    "    all_cell_names.extend(cell_names)\n",
    "    all_cell_obs.append(cell_obs)\n",
    "    all_embeddings.append(embeddings)\n",
    "\n",
    "# Combine all embeddings into a single array\n",
    "combined_embeddings = np.vstack(all_embeddings)\n",
    "\n",
    "# Combine all cell observations\n",
    "combined_obs = pd.concat(all_cell_obs)\n",
    "\n",
    "# Create an AnnData object with the combined embeddings\n",
    "combined_ad = anndata.AnnData(\n",
    "    X=combined_embeddings,\n",
    "    obs=combined_obs\n",
    ")\n",
    "\n",
    "print(f\"Created combined embeddings AnnData with {combined_ad.shape[0]} cells and {combined_ad.shape[1]} latent dimensions\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "e657516a",
   "metadata": {},
   "outputs": [],
   "source": [
    "combined_ad.write_h5ad(\"/home/ubuntu/data/ldvae_embeddings.h5ad\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "ce282c9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "edist = pd.read_csv(\"/home/ubuntu/tahoe/clin-oracle-tahoe-hack-2025/data/tahoe_largest_edist_per_drug_cell_line_combo.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "f15c9a3a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "View of AnnData object with n_obs × n_vars = 0 × 50\n",
       " obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate'"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combined_ad[combined_ad.obs[\"drugname_drugconc\"].isin(edist[\"drug_info\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "15f202ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The raw labels are wrapped in [] (matching them unstripped selects 0 cells),\n",
    "# so strip the brackets before comparing against edist[\"drug_info\"].\n",
    "# .str.strip tolerates missing values, unlike calling x.strip on each element.\n",
    "drug_labels = combined_ad.obs[\"drugname_drugconc\"].str.strip('[]')\n",
    "combined_ad_filt = combined_ad[drug_labels.isin(edist[\"drug_info\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4e19a2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Aggregate control data by cell line\n",
    "agg_control = sc.get.aggregate(combined_ad[combined_ad.obs[\"drug\"]==\"DMSO_TF\"], by=[\"cell_line\"], func=[\"mean\"])\n",
    "\n",
    "# Aggregate drug data by cell line and drug\n",
    "agg_drug = sc.get.aggregate(combined_ad_filt, by=[\"cell_line\", \"drugname_drugconc\"], func=[\"mean\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "9df0969b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a dictionary to map cell line to control embedding\n",
    "# (each value is kept as a one-row 2-D slice of the control means)\n",
    "control_mean = np.asarray(agg_control.layers['mean'])\n",
    "control_dict = {\n",
    "    cell_line: control_mean[[row]]\n",
    "    for row, cell_line in enumerate(agg_control.obs_names)\n",
    "}\n",
    "\n",
    "# Find the control row for every aggregated drug row in one vectorized lookup\n",
    "drug_cell_lines = agg_drug.obs['cell_line'].astype(str)\n",
    "control_rows = agg_control.obs_names.get_indexer(pd.Index(drug_cell_lines))\n",
    "\n",
    "# get_indexer marks unmatched labels with -1; fail loudly instead of subtracting a wrong row\n",
    "if (control_rows < 0).any():\n",
    "    missing = sorted(set(drug_cell_lines[control_rows < 0]))\n",
    "    raise KeyError(f'No control aggregate for cell lines: {missing}')\n",
    "\n",
    "# Subtract control from drug for each cell line\n",
    "drug_minus_control = np.asarray(agg_drug.layers['mean']) - control_mean[control_rows]\n",
    "\n",
    "# Store the result in agg_drug\n",
    "agg_drug.layers['drug_minus_control'] = drug_minus_control"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "23f57c30",
   "metadata": {},
   "outputs": [],
   "source": [
    "drug_info_dict = dict(zip(edist[\"drug_info\"], edist[\"drug_name\"]))\n",
    "agg_drug.obs[\"drug\"] = agg_drug.obs[\"drugname_drugconc\"].str.strip('[]').map(drug_info_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "26828179",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | 0 | \n", "1 | \n", "2 | \n", "3 | \n", "4 | \n", "5 | \n", "6 | \n", "7 | \n", "8 | \n", "9 | \n", "... | \n", "40 | \n", "41 | \n", "42 | \n", "43 | \n", "44 | \n", "45 | \n", "46 | \n", "47 | \n", "48 | \n", "49 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
(R)-Verapamil (hydrochloride)_CVCL_0023 | \n", "-0.001189 | \n", "-0.000930 | \n", "-0.000356 | \n", "-0.000772 | \n", "0.001502 | \n", "-0.001483 | \n", "-0.000367 | \n", "-0.000646 | \n", "0.000231 | \n", "0.000054 | \n", "... | \n", "0.006757 | \n", "-0.000348 | \n", "-0.001312 | \n", "0.002497 | \n", "-0.001008 | \n", "0.002215 | \n", "-0.001303 | \n", "-0.000437 | \n", "-0.000089 | \n", "0.000248 | \n", "
(R)-Verapamil (hydrochloride)_CVCL_0023 | \n", "0.000509 | \n", "-0.000314 | \n", "-0.000501 | \n", "0.001635 | \n", "0.000267 | \n", "0.001530 | \n", "-0.000254 | \n", "0.000078 | \n", "-0.000296 | \n", "-0.001709 | \n", "... | \n", "-0.004816 | \n", "0.002798 | \n", "0.000306 | \n", "-0.001051 | \n", "0.000692 | \n", "-0.001758 | \n", "0.000546 | \n", "0.000159 | \n", "-0.000646 | \n", "0.000613 | \n", "
(S)-Crizotinib_CVCL_0023 | \n", "0.001051 | \n", "0.000752 | \n", "-0.000214 | \n", "0.001631 | \n", "-0.000497 | \n", "0.001123 | \n", "0.000194 | \n", "0.000487 | \n", "0.000163 | \n", "-0.000990 | \n", "... | \n", "-0.006705 | \n", "0.001446 | \n", "0.000833 | \n", "-0.002262 | \n", "0.001354 | \n", "-0.001338 | \n", "0.000030 | \n", "0.000791 | \n", "-0.000406 | \n", "-0.000211 | \n", "
(S)-Crizotinib_CVCL_0023 | \n", "0.000526 | \n", "0.000310 | \n", "-0.000936 | \n", "-0.000181 | \n", "-0.000224 | \n", "-0.000014 | \n", "0.000108 | \n", "0.000219 | \n", "0.000659 | \n", "0.000945 | \n", "... | \n", "-0.000703 | \n", "0.000048 | \n", "0.000071 | \n", "-0.001652 | \n", "0.000142 | \n", "0.000163 | \n", "-0.001194 | \n", "0.000606 | \n", "-0.000202 | \n", "-0.000306 | \n", "
18β-Glycyrrhetinic acid_CVCL_0023 | \n", "0.000612 | \n", "-0.000744 | \n", "-0.001400 | \n", "0.001152 | \n", "0.000551 | \n", "0.001453 | \n", "-0.000323 | \n", "0.000138 | \n", "-0.000099 | \n", "-0.001424 | \n", "... | \n", "-0.004113 | \n", "0.002303 | \n", "0.000311 | \n", "-0.002108 | \n", "0.000868 | \n", "-0.000326 | \n", "0.000812 | \n", "0.000273 | \n", "-0.000702 | \n", "0.000532 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
vincristine_CVCL_C466 | \n", "-0.000265 | \n", "0.000069 | \n", "0.000329 | \n", "-0.000427 | \n", "0.001357 | \n", "-0.000266 | \n", "-0.000137 | \n", "-0.000241 | \n", "-0.000239 | \n", "0.000142 | \n", "... | \n", "0.002062 | \n", "-0.001047 | \n", "-0.000634 | \n", "-0.000247 | \n", "-0.000422 | \n", "0.000455 | \n", "0.001756 | \n", "-0.000591 | \n", "0.000308 | \n", "-0.000390 | \n", "
vincristine_CVCL_C466 | \n", "0.001270 | \n", "0.000834 | \n", "0.001470 | \n", "0.000604 | \n", "-0.000746 | \n", "0.001983 | \n", "0.000357 | \n", "0.000831 | \n", "-0.001709 | \n", "-0.000370 | \n", "... | \n", "-0.009023 | \n", "0.001240 | \n", "0.000909 | \n", "-0.000490 | \n", "0.002720 | \n", "-0.003731 | \n", "0.001937 | \n", "0.000975 | \n", "0.000028 | \n", "0.000928 | \n", "
γ-Oryzanol_CVCL_C466 | \n", "0.000580 | \n", "0.000537 | \n", "-0.000426 | \n", "0.000732 | \n", "-0.001178 | \n", "0.002092 | \n", "-0.000386 | \n", "0.000015 | \n", "-0.000544 | \n", "0.000309 | \n", "... | \n", "-0.001335 | \n", "-0.000095 | \n", "0.000059 | \n", "0.000016 | \n", "0.000035 | \n", "-0.001453 | \n", "0.000773 | \n", "0.000624 | \n", "-0.000406 | \n", "0.000121 | \n", "
γ-Oryzanol_CVCL_C466 | \n", "0.000566 | \n", "0.000652 | \n", "-0.000245 | \n", "0.000373 | \n", "-0.001173 | \n", "0.002531 | \n", "-0.000274 | \n", "0.000059 | \n", "-0.000458 | \n", "0.000660 | \n", "... | \n", "-0.002606 | \n", "0.000012 | \n", "0.000019 | \n", "-0.000053 | \n", "-0.000455 | \n", "-0.002143 | \n", "0.001301 | \n", "0.000597 | \n", "-0.000375 | \n", "0.000082 | \n", "
γ-Oryzanol_CVCL_C466 | \n", "0.000395 | \n", "0.000591 | \n", "-0.000391 | \n", "0.000089 | \n", "-0.000803 | \n", "0.001497 | \n", "-0.000250 | \n", "0.000237 | \n", "-0.001100 | \n", "0.000208 | \n", "... | \n", "-0.001126 | \n", "0.000414 | \n", "0.000312 | \n", "-0.000449 | \n", "0.000261 | \n", "-0.001099 | \n", "-0.001314 | \n", "0.000568 | \n", "-0.000030 | \n", "0.000338 | \n", "
47846 rows × 50 columns
\n", "