Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 56 additions & 56 deletions main/COMO.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@
" raise FileNotFoundError(\"Root directory could not be determined; unable to find 'COMO.ipynb'\")\n",
" current_dir = current_dir.parent"
],
"id": "2656ec5201a33b9f"
"id": "f58450da8bd97732"
},
{
"metadata": {},
Expand Down Expand Up @@ -299,7 +299,7 @@
"mrna_metadata_filepath = Path(notebook_dir / \"data/config_sheets/mrna_config.xlsx\")\n",
"proteomics_metadata_filepath = Path(notebook_dir / \"data/config_sheets/proteomics_config.xlsx\")\n"
],
"id": "3aa04b2bf1798c20"
"id": "c1d957a21a4b5393"
},
{
"metadata": {},
Expand All @@ -311,7 +311,7 @@
"- `taxon_id`: The [NCBI Taxon ID](https://www.ncbi.nlm.nih.gov/taxonomy) to use\n",
"- `preprocess_mode`: This should be set to `\"create-matrix\"` if you are **not** providing a matrix, otherwise set it to `\"provide-matrix\"`"
],
"id": "11852d88cdc2ee32"
"id": "7d813235940a2e89"
},
{
"metadata": {},
Expand All @@ -322,7 +322,7 @@
"for context in context_names:\n",
" if context not in {*trna_matrix_filepath, *mrna_matrix_filepath}:\n",
" continue\n",
" await rnaseq_preprocess(\n",
" rnaseq_preprocess(\n",
" context_name=context,\n",
" taxon=taxon_id,\n",
" como_context_dir=como_context_dir[context],\n",
Expand All @@ -336,7 +336,7 @@
" log_level=\"INFO\",\n",
" )"
],
"id": "49a9206c10732797"
"id": "b2cb28bba57ed02f"
},
{
"metadata": {},
Expand Down Expand Up @@ -369,7 +369,7 @@
"\n",
"This method is not recommended, as zFPKM is much more robust for a similar level of \"hands-off\" model building\n"
],
"id": "addc4f6dd55a33f3"
"id": "b9227a4f747356f2"
},
{
"metadata": {},
Expand All @@ -390,7 +390,7 @@
"#### Single Cell RNA Sequencing\n",
"While the Snakemake pipeline does not yet support single-cell alignment, and COMO does not yet support automated configuration file and counts matrix file creation for single-cell alignment output from STAR, it is possible to use single-cell data to create a model with COMO. Because normalization strategies can be applied to single-cell data in the same way it is applied to bulk RNA sequencing, `como/rnaseq_gen.py` can be used with a provided counts matrix and configuration file, from [Step 1](Step-1:-Initialize-and-Preprocess-RNA-seq-data), above. Just like `\"total\"` and `\"mRNA\"`, `como/rnaseq_gen.py` can be executed with `\"SC\"` as the \"`--library-prep`\" argument to help COMO differentiate it from any bulk RNA sequencing data if multiple strategies are being used."
],
"id": "ed35195f4278ae5c"
"id": "4c960cd5ddcdd542"
},
{
"metadata": {},
Expand All @@ -409,7 +409,7 @@
"- `min_zfpkm`: The cutoff for Counts-Per-Million filtering\n",
"- `prep_method`: The library method used for preparation. Options are: `\"total\"`, `\"mRNA\"`, or `\"SC\"`,\n"
],
"id": "6dd118e06e9e4da0"
"id": "cbacf586116ce040"
},
{
"metadata": {},
Expand All @@ -429,15 +429,17 @@
"for context in context_names:\n",
" if context not in trna_matrix_filepath:\n",
" continue\n",
" output_zscore_norm_filepath = Path(get_notebook_dir() / f\"data/results/{context}/z_score_normalization.csv\")\n",
" await rnaseq_gen(\n",
" output_zscore_norm_filepath = Path(\n",
" get_notebook_dir() / \"data\" / \"results\" / context / \"trna_z_score_normalization.csv\"\n",
" )\n",
" rnaseq_gen(\n",
" context_name=context,\n",
" input_metadata_filepath_or_df=trna_metadata_filepath,\n",
" input_rnaseq_filepath=trna_matrix_filepath[context],\n",
" input_gene_info_filepath=gene_info_filepath[context],\n",
" output_boolean_activity_filepath=trna_matrix_filepath[context],\n",
" prep=RNAType.TRNA,\n",
" taxon_id=taxon_id,\n",
" input_metadata_filepath_or_df=trna_metadata_filepath,\n",
" replicate_ratio=replicate_ratio,\n",
" high_replicate_ratio=high_confidence_replicate_ratio,\n",
" batch_ratio=batch_ratio,\n",
Expand All @@ -447,7 +449,7 @@
" output_zscore_normalization_filepath=output_zscore_norm_filepath,\n",
" )"
],
"id": "bd15ec97dd0a38a8"
"id": "6f7e1634d7a912ba"
},
{
"metadata": {},
Expand All @@ -467,7 +469,7 @@
"- `min_zfpkm`: The cutoff for Counts-Per-Million filtering\n",
"- `prep_method`: The library method used for preparation. Options are: `\"total\"`, `\"mRNA\"`, or `\"SC\"`,\n"
],
"id": "71fd3eab25176aad"
"id": "ca2918f7d6e23e5c"
},
{
"metadata": {},
Expand All @@ -487,26 +489,27 @@
"for context in context_names:\n",
" if context not in mrna_matrix_filepath:\n",
" continue\n",
" await rnaseq_gen(\n",
" rnaseq_gen(\n",
" context_name=context,\n",
" input_metadata_filepath_or_df=mrna_metadata_filepath,\n",
" input_rnaseq_filepath=mrna_matrix_filepath[context],\n",
" input_gene_info_filepath=gene_info_filepath[context],\n",
" output_boolean_activity_filepath=mrna_matrix_filepath[context],\n",
" prep=RNAType.MRNA,\n",
" taxon_id=taxon_id,\n",
" input_metadata_filepath=mrna_metadata_filepath,\n",
" replicate_ratio=replicate_ratio,\n",
" high_replicate_ratio=high_confidence_replicate_ratio,\n",
" batch_ratio=batch_ratio,\n",
" high_batch_ratio=high_confidence_batch_ratio,\n",
" technique=technique,\n",
" cutoff=cutoff,\n",
" output_zscore_normalization_filepath=Path(get_notebook_dir(),\n",
" f\"data/results/{context}/z_score_normalization.csv\"),\n",
" output_zscore_normalization_filepath=Path(\n",
" get_notebook_dir(),\n",
" f\"data/results/{context}/z_score_normalization.csv\"\n",
" ),\n",
" )"
],
"id": "925f939b1f318673"
"id": "af4293b08391ad2f"
},
{
"metadata": {},
Expand All @@ -526,7 +529,7 @@
"- `min_zfpkm`: The cutoff for Counts-Per-Million filtering\n",
"- `prep_method`: The library method used for preparation. Options are: `\"total\"`, `\"mRNA\"`, or `\"scrna\"`,\n"
],
"id": "ddc6b8d634feaacc"
"id": "24d6510e09b88018"
},
{
"metadata": {},
Expand All @@ -546,24 +549,27 @@
"for context in context_names:\n",
" if context not in scrna_matrix_filepath:\n",
" continue\n",
" await rnaseq_gen(\n",
" output_zscore_norm_filepath = Path(\n",
" get_notebook_dir() / \"data\" / \"results\" / context / \"mrna_zscore_normalization.csv\"\n",
" )\n",
" rnaseq_gen(\n",
" context_name=context,\n",
" input_metadata_filepath_or_df=mrna_metadata_filepath,\n",
" input_rnaseq_filepath=scrna_matrix_filepath[context],\n",
" input_gene_info_filepath=gene_info_filepath[context],\n",
" output_boolean_activity_filepath=scrna_matrix_filepath[context],\n",
" prep=RNAType.SCRNA,\n",
" taxon_id=taxon_id,\n",
" input_metadata_filepath=Path(\"./data/config_sheets/scrna_config.xlsx\"),\n",
" replicate_ratio=replicate_ratio,\n",
" high_replicate_ratio=high_confidence_replicate_ratio,\n",
" batch_ratio=batch_ratio,\n",
" high_batch_ratio=high_confidence_batch_ratio,\n",
" technique=technique,\n",
" cutoff=cutoff,\n",
" output_zscore_normalization_filepath=None,\n",
" output_zscore_normalization_filepath=output_zscore_norm_filepath\n",
" )"
],
"id": "ff137d18eed6995b"
"id": "ef57ea1b08c1b121"
},
{
"metadata": {},
Expand All @@ -580,7 +586,7 @@
"- `high_batch_ratio`: The ratio required before a gene is considered \"high-confidence\" in the study\n",
"- `quantile`: The cutoff Transcripts-Per-Million quantile for filtering"
],
"id": "8ca2a08af58c517d"
"id": "c2c5cc7eb9d2e44f"
},
{
"metadata": {},
Expand All @@ -592,7 +598,7 @@
"\n",
"for context in context_names:\n",
" await proteomics_gen(\n",
" context_name=context_names,\n",
" context_name=context,\n",
" config_filepath=proteomics_metadata_filepath,\n",
" matrix_filepath=proteomics_matrix_filepath[context],\n",
" output_boolean_filepath=Path(f\"data/results/{context}/proteomics/{context}_proteomics_boolean_matrix.csv\"),\n",
Expand All @@ -607,7 +613,7 @@
" quantile=25,\n",
" )"
],
"id": "2124f206489b1002"
"id": "ca39fe2e0744a401"
},
{
"metadata": {},
Expand All @@ -633,7 +639,7 @@
"- `n_neighbors_context`: N nearest neighbors for context clustering. The default is `\"default\"`, which is the total number of contexts\n",
"- `seed`: The random seed for clustering algorithm initialization. If not specified, `np.random.randint(0, 100000)` is used"
],
"id": "50c625b676e3d643"
"id": "6a7f40910eefa1cc"
},
{
"metadata": {},
Expand Down Expand Up @@ -677,7 +683,7 @@
"\n",
"!{cmd}"
],
"id": "9f1a7a85673fc17"
"id": "1728cd710f834c2f"
},
{
"metadata": {},
Expand Down Expand Up @@ -717,7 +723,7 @@
"\n",
"Each of the \"weights\" (`total_rna_weight`, `mrna_weight`, etc.) are used to place a significance on each method. Becuase there are many steps in the Dogma from transcription to translation, the gene expression as seen by total RNA or mRNA sequencing may not be representative of the gene's protein expression, and this its metabolic impact. Because of this, you are able to weight each source more (or less) than another."
],
"id": "5bb309f8d441ddcf"
"id": "db5681f5e8c4111e"
},
{
"metadata": {},
Expand Down Expand Up @@ -746,16 +752,13 @@
"trna_batches = {\"naiveB\": [\"naiveB_S3R1\", \"naiveB_S3R2\", \"naiveB_S3R3\"]}\n",
"\n",
"for context in context_names:\n",
" await merge_xomics(\n",
" merge_xomics(\n",
" context_name=context,\n",
" taxon_id=taxon_id,\n",
" trna_matrix_or_filepath=trna_matrix_filepath[context],\n",
" mrna_matrix_or_filepath=mrna_matrix_filepath[context],\n",
" scrna_matrix_or_filepath=None, # scrna_matrix_filepath[context],\n",
" proteomic_matrix_or_filepath=None, # proteomics_matrix_filepath[context],\n",
" trna_batches=trna_batches,\n",
" mrna_batches=mrna_batches,\n",
" scrna_batches=None,\n",
" proteomic_batches=None,\n",
" trna_weight=total_rna_weight,\n",
" mrna_weight=mrna_weight,\n",
" scrna_weight=single_cell_weight,\n",
Expand All @@ -765,10 +768,7 @@
" adjust_method=requirement_adjustment_method,\n",
" force_activate_high_confidence=force_activate_high_confidence,\n",
" adjust_for_na=adjust_for_na_sources,\n",
" merge_zfpkm_distribution=merge_zfpkm_distrubution,\n",
" keep_transcriptomics_score=keep_transcriptomics_score,\n",
" output_merge_activity_filepath=Path(f\"data/results/{context}/ActiveGenes_{context}_Merged.csv\"),\n",
" output_transcriptomic_details_filepath=Path(f\"data/results/{context}/TranscriptomicDetails_{context}.csv\"),\n",
" output_trna_activity_filepath=Path(f\"data/results/{context}/total-rna/trna_activity_{context}.csv\"),\n",
" output_mrna_activity_filepath=Path(f\"data/results/{context}/mrna/mrna_activity_{context}.csv\"),\n",
" output_scrna_activity_filepath=Path(f\"data/results/{context}/scrna/scrna_activity_{context}.csv\"),\n",
Expand All @@ -777,7 +777,7 @@
" output_figure_dirpath=Path(f\"data/results/{context}/figures\")\n",
" )"
],
"id": "5cf72339439acf79"
"id": "c48d8b8d6ab11e16"
},
{
"metadata": {},
Expand Down Expand Up @@ -844,7 +844,7 @@
"- `force_reactions_filename`: The filename of the force reactions to be used. Force reactions will (as the name implies) force the optimizer to use these reactions, **no matter their expression**\n",
"- `exclude_reactions_filename`: The filename of reactions to exclude from the model, no matter their expression"
],
"id": "691335a66e36ee7c"
"id": "e80c7864a129ea83"
},
{
"metadata": {},
Expand All @@ -870,31 +870,31 @@
"recon_algorithms = [\"IMAT\"]\n",
"solver = \"GUROBI\"\n",
"\n",
"config = Config()\n",
"\n",
"for recon_alg in recon_algorithms:\n",
" for context in context_names:\n",
" await create_context_specific_model(\n",
" create_context_specific_model(\n",
" context_name=context,\n",
" reference_model=Path(\n",
" \"/Users/satominakamura/Desktop/Dr.Helikar Lab/COMO/main/data/GeneralModelUpdatedV2.mat\"),\n",
" taxon_id=taxon_id,\n",
" reference_model_filepath=Path(\n",
" get_notebook_dir() / \"data\" / \"reference_models\" / \"GeneralModelUpdatedV3.json\"\n",
" ),\n",
" active_genes_filepath=Path(f\"{notebook_dir}/data/results/{context}/ActiveGenes_{context}_Merged.csv\"),\n",
" output_infeasible_reactions_filepath=Path(\n",
" f\"{notebook_dir}/data/results/{context}/infeasible_reactions_{context}.csv\"),\n",
" f\"{notebook_dir}/data/results/{context}/infeasible_reactions_{context}.csv\"\n",
" ),\n",
" output_flux_result_filepath=Path(f\"{notebook_dir}/data/results/{context}/FluxResults_{context}.csv\"),\n",
" output_model_filepaths=Path(\n",
" f\"{notebook_dir}/data/results/{context}/{context}_{recon_alg}_model.json\"),\n",
" f\"{notebook_dir}/data/results/{context}/{context}_{recon_alg}_model.json\"\n",
" ),\n",
" objective=\"biomass_maintenance\",\n",
" boundary_rxns_filepath=Path(f\"{notebook_dir}/data/boundary_rxns/{context}_boundary_rxns.csv\"),\n",
" exclude_rxns_filepath=Path(f\"{notebook_dir}/data/exclude_rxns/{context}_exclude_rxns.csv\"),\n",
" force_rxns_filepath=Path(f\"{notebook_dir}/data/force_rxns/{context}_force_rxns.csv\"),\n",
" algorithm=Algorithm.IMAT,\n",
" solver=Solver.GUROBI,\n",
" )\n",
" # fmt: on\n",
" !{cmd}"
" )"
],
"id": "961737f5e356c2b6"
"id": "4c9747c67bc80e88"
},
{
"metadata": {},
Expand All @@ -919,7 +919,7 @@
"- `exampleTissue`: This is the name of the tissue context\n",
"- `ALGORITHM`: This is the algorithm (`recon_algorithm`) used in the above model creation step\n"
],
"id": "d5ce2da2d3b27868"
"id": "6e60a22a03b54803"
},
{
"metadata": {},
Expand Down Expand Up @@ -997,7 +997,7 @@
"\n",
" !{cmd}"
],
"id": "a8ed4f4f6f10fec8"
"id": "14d4ca54309049b0"
},
{
"metadata": {},
Expand All @@ -1016,7 +1016,7 @@
"- `data_source`: The datasource you are using for disease analysis. This should be`\"rnaseq\"`\n",
"- `taxon_id`: The [NCBI Taxon ID](https://www.ncbi.nlm.nih.gov/taxonomy) to use for disease analysis"
],
"id": "84dd6096f89ad000"
"id": "51cd2ec29bbbd4a0"
},
{
"metadata": {},
Expand Down Expand Up @@ -1047,7 +1047,7 @@
"\n",
" !{cmd}"
],
"id": "6daddfb1edac03ab"
"id": "ead8911867f0e352"
},
{
"metadata": {},
Expand Down Expand Up @@ -1082,7 +1082,7 @@
"\n",
"- `solver`: The solver you would like to use. Available options are `\"gurobi\"` or `\"glpk\"`\n"
],
"id": "645d829a6b3266cd"
"id": "a1ba8eca6e7c6991"
},
{
"metadata": {},
Expand Down Expand Up @@ -1153,7 +1153,7 @@
" cmd = \" \".join(cmd)\n",
" !{cmd}"
],
"id": "9c1731579d3711b2"
"id": "fcdf3a87b72665a6"
}
],
"metadata": {},
Expand Down
2 changes: 1 addition & 1 deletion main/como/proteomics_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def load_empty_dict():
)
return load_empty_dict()


# TODO: Convert to synchronous function
async def proteomics_gen(
context_name: str,
config_filepath: Path,
Expand Down