openproblems-bio · dariarom94 · Oct 23, 2025 · Oct 28, 2025 · Oct 29, 2025 · LouisK92
diff --git a/src/methods_transcript_assignment/fastreseg/config.vsh.yaml b/src/methods_transcript_assignment/fastreseg/config.vsh.yaml
@@ -0,0 +1,81 @@
+__merge__: /src/api/comp_method_transcript_assignment.yaml
+
+name: fastreseg
+label: "fastReseg transcript assignment"
+summary: "Spatial segmentation correction using fastReseg method"
+description: |
+  fastReseg is an R package designed to enhance the precision of cell segmentation in spatial transcriptomics by 
+  leveraging transcriptomic data to correct and refine initial image-based segmentation results.
+links:
+  documentation: "https://github.com/openproblems-bio/task_ist_preprocessing"
+  repository: "https://github.com/openproblems-bio/task_ist_preprocessing"
+references:
+  doi: "10.1038/s41598-025-08733-5"
+
+
+arguments:
+  - name: --transcripts_key
+    type: string
+    default: "transcripts"
+    description: "Key for transcripts in the points layer"
+  - name: --coordinate_system
+    type: string
+    default: "global"
+    description: "Coordinate system for the transcripts"
+
+
+resources:
+  - type: bash_script
+    path: orchestrator.sh
+  - type: python_script
+    path: input.py
+  - type: file
+    path: fastreseg.yml
+    dest: environment.yml
+  - type: r_script
+    path: script.R
+  - type: python_script
+    path: output.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: apt
+        packages:
+          - libv8-dev
+          - libudunits2-dev
+          - libabsl-dev
+          - gdal-bin
+          - libgdal-dev
+          - r-base
+      - type: r
+        packages:
+          - devtools
+          - terra
+          - Matrix
+          - dbscan
+          - igraph
+          - matrixStats
+          - codetools
+          - data.table
+          # Add other CRAN packages here
+        github:
+          # Add GitHub packages here if needed
+          - Nanostring-Biostats/FastReseg
+      #- type: python
+      #  github:
+      #    - openproblems-bio/core#subdirectory=packages/python/openproblems
+    __merge__: 
+      - /src/base/setup_txsim_partial.yaml
+      - /src/base/setup_spatialdata_partial.yaml
+  - type: native
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [ midtime, midcpu, midmem ]
+
+
+##macOS workaround: export DOCKER_DEFAULT_PLATFORM=linux/amd64
diff --git a/src/methods_transcript_assignment/fastreseg/fastreseg.yml b/src/methods_transcript_assignment/fastreseg/fastreseg.yml
@@ -0,0 +1,13 @@
+name: fastreseg
+channels:
+  - anaconda
+  - conda-forge
+  - defaults
+dependencies:
+  - r-base=4.5.1
+  - r-devtools
+  - pip
+  - ipykernel
+  - jupyter_client
+  - r-irkernel
+prefix: /opt/miniconda3/envs/fastreseg
diff --git a/src/methods_transcript_assignment/fastreseg/input.py b/src/methods_transcript_assignment/fastreseg/input.py
@@ -0,0 +1,116 @@
+import spatialdata as sd
+from tifffile import imwrite  
+import sys
+import numpy as np
+import pandas as pd
+import xarray as xr
+import dask
+import txsim as tx
+import anndata as ad
+import argparse
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(
+        description="Process input files and generate output files for FastReseg input"
+    )
+    parser.add_argument('input_path_ist', help='Path to the input file')
+    parser.add_argument('input_segmentation_path', help='Path to the input segmentation file')
+    parser.add_argument('input_sc_reference_path', help='Path to the input single-cell reference file')
+    parser.add_argument('output_path_counts', help='Path for the output TSV file')
+    parser.add_argument('output_path_transcripts', help='Path for the output TIF file')
+    parser.add_argument('output_path_cell_type', help='Output cell type specification')
+
+    return parser.parse_args()
+
+### parsing arguments
+args = parse_arguments()
+print("args:")
+print(args)
+input_path = args.input_path_ist
+input_segmentation_path = args.input_segmentation_path
+input_sc_reference_path = args.input_sc_reference_path
+print("path")
+print(input_sc_reference_path)
+output_path_counts = args.output_path_counts
+output_path_transcripts = args.output_path_transcripts
+output_path_cell_type = args.output_path_cell_type
+
+## potential other parameters (TODO - make configurable)
+um_per_pixel = 0.5
+sc_celltype_key = 'cell_type'
+
+### reading the data in
+sdata = sd.read_zarr(input_path)
+
+### reading in basic segmentation
+sdata_segm = sd.read_zarr(input_segmentation_path)
+segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
+
+# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
+trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)['global'].inverse()
+
+transcripts = sd.transform(sdata['transcripts'], to_coordinate_system='global')
+transcripts = sd.transform(transcripts, trans, 'global')
+
+print('Assigning transcripts to cell ids', flush=True)
+y_coords = transcripts.y.compute().to_numpy(dtype=np.int64)
+x_coords = transcripts.x.compute().to_numpy(dtype=np.int64)
+if isinstance(sdata_segm["segmentation"], xr.DataTree):
+    label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy() 
+else:
+    label_image = sdata_segm["segmentation"].to_numpy()
+cell_id_dask_series = dask.dataframe.from_dask_array(
+    dask.array.from_array(
+        label_image[y_coords, x_coords], chunks=tuple(sdata['transcripts'].map_partitions(len).compute())
+    ), 
+    index=sdata['transcripts'].index
+)
+sdata['transcripts']["cell_id"] = cell_id_dask_series
+
+### extracting transcript ids
+print('Transforming transcripts coordinates', flush=True)
+transcripts = sd.transform(sdata['transcripts'], to_coordinate_system='global')
+
+transcripts_df = transcripts.compute()
+transcripts_df.rename(columns = {'feature_name': 'target', 
+'transcript_id': 'UMI_transID', 'cell_id': 'UMI_cellID'}, inplace = True)
+
+transcripts_df = transcripts_df.loc[:, ['target', 'x', 'y', 'z', 'UMI_transID', 'UMI_cellID']]
+transcripts_df.to_csv(output_path_transcripts)
+
+
+#### aggregating counts per transcript, based on 
+df = sdata['transcripts'].compute()
+df.feature_name = df.feature_name.astype(str)
+
+adata_sp = tx.preprocessing.generate_adata(df, cell_id_col='cell_id', gene_col='feature_name') #TODO: x and y refers to a specific coordinate system. Decide which space we want to use here. (probably should be handled in the previous assignment step)
+adata_sp.layers['counts'] = adata_sp.layers['raw_counts']
+del adata_sp.layers['raw_counts']
+adata_sp.var["gene_name"] = adata_sp.var_names
+print(adata_sp.var_names[1:10])
+
+# currently the function also saves the transcripts in the adata object, but this is not necessary here
+del adata_sp.uns['spots']
+del adata_sp.uns['pct_noise']
+
+
+count_df = pd.DataFrame(adata_sp.X.toarray(), 
+                       index=adata_sp.obs_names, 
+                       columns=adata_sp.var_names)
+count_df.to_csv(output_path_counts)
+
+#### run cell annotation with ssam
+adata_sc = ad.read_h5ad(input_sc_reference_path)
+adata_sc.X = adata_sc.layers["normalized"]
+print(adata_sc.var_names[1:10])
+
+shared_genes = [g for g in adata_sc.var_names if g in adata_sp.var_names]
+adata_sp = adata_sp[:,shared_genes] 
+
+print('Annotating cell types', flush=True)
+adata_sp = tx.preprocessing.run_ssam(
+    adata_sp, transcripts.compute(), adata_sc, um_p_px=um_per_pixel, 
+    cell_id_col='cell_id', gene_col='feature_name', sc_ct_key=sc_celltype_key
+)
+cell_type_df = adata_sp.obs["ct_ssam"].astype(str)
+cell_type_df.to_csv(output_path_cell_type, header=True)
diff --git a/src/methods_transcript_assignment/fastreseg/orchestrator.sh b/src/methods_transcript_assignment/fastreseg/orchestrator.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+
+## VIASH START
+# The following code has been auto-generated by Viash.
+par_input_ist='resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr'
+par_input_segmentation='resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr'
+par_input_scrnaseq='resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad'
+par_sc_cell_type_key='cell_type'
+par_output='resources_test/task_ist_preprocessing/mouse_brain_combined/transcript_assignments.zarr'
+par_transcripts_key='transcripts'
+par_coordinate_system='global'
+meta_name='fastreseg'
+meta_functionality_name='fastreseg'
+meta_resources_dir='/private/tmp/viash_inject_fastreseg18170326436127140412'
+meta_executable='/private/tmp/viash_inject_fastreseg18170326436127140412/fastreseg'
+meta_config='/private/tmp/viash_inject_fastreseg18170326436127140412/.config.vsh.yaml'
+meta_temp_dir='/var/folders/fq/ymt0vml175s4yvqxzbmlmpz80000gn/T/'
+meta_cpus='123'
+meta_memory_b='123'
+meta_memory_kb='123'
+meta_memory_mb='123'
+meta_memory_gb='123'
+meta_memory_tb='123'
+meta_memory_pb='123'
+meta_memory_kib='123'
+meta_memory_mib='123'
+meta_memory_gib='123'
+meta_memory_tib='123'
+meta_memory_pib='123'
+
+## VIASH END
+
+par_intermediate_dir=$(mktemp -d -p "$(pwd)" tmp-processing-XXXXXXXX)
+# Access the YAML file
+CONDA_ENV_FILE="$meta_resources_dir/environment.yml"
+echo $CONDA_ENV_FILE
+echo "running FastReseg orchestrator"
+
+# Create intermediate directory
+mkdir -p "$par_intermediate_dir"
+echo $(date +%T)
+
+# Step 1: Run Python script to reformat input in the first Python environment
+python "input.py" \
+  "$par_input_ist" \
+  "$par_input_segmentation" \
+  "$par_input_scrnaseq" \
+  "$par_intermediate_dir/counts.tsv" \
+  "$par_intermediate_dir/transcripts.tsv" \
+  "$par_intermediate_dir/cell_types.tsv"
+
+head $par_intermediate_dir/cell_types.tsv
+
+# Step 2: RunFastReseg
+## making conda environment with r-base and fastReseg
+#export CONDA_DIR=/opt/conda
+#wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
+#    /bin/bash ~/miniconda.sh -b -p /opt/conda
+# Put conda in path so we can use conda activate
+#export PATH=$CONDA_DIR/bin:$PATH
+
+# Create and activate the second Python environment
+# Initialize conda for bash
+#eval "$(/opt/conda/bin/conda shell.bash hook)"
+#conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
+#conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
+#conda env create -f $CONDA_ENV_FILE
+
+#conda activate fastreseg
+#conda install conda-forge::r-concaveman conda-forge::r-data.table conda-forge::r-ggplot2 conda-forge::r-devtools conda-forge::r-terra
+#conda install conda-forge::r-codetools conda-forge::r-matrix conda-forge::r-dbscan conda-forge::r-igraph  conda-forge::r-matrixstats
+
+##running the R script
+Rscript script.R "$par_intermediate_dir/counts.tsv" \
+  "$par_intermediate_dir/transcripts.tsv" \
+  "$par_intermediate_dir/cell_types.tsv" \
+  "$par_intermediate_dir/cell_ids.csv" \
+ "$par_intermediate_dir/gene_names.csv" \
+  "$par_intermediate_dir/transcripts_out.csv"
+
+## python output
+python "output.py" \
+  "$par_intermediate_dir/cell_ids.csv" \
+  "$par_intermediate_dir/gene_names.csv" \
+  "$par_intermediate_dir/transcripts_out.csv" \
+  "$par_output"
+
+echo $(date +%T)
diff --git a/src/methods_transcript_assignment/fastreseg/output.py b/src/methods_transcript_assignment/fastreseg/output.py
@@ -0,0 +1,54 @@
+import spatialdata as sd
+from tifffile import imwrite
+import sys
+import numpy as np
+import pandas as pd
+import xarray as xr
+from scipy.sparse import coo_matrix
+from pathlib import Path
+import os
+import dask
+import anndata as ad
+
+def convert_to_lower_dtype(arr):
+    max_val = arr.max()
+    if max_val <= np.iinfo(np.uint8).max:
+        new_dtype = np.uint8
+    elif max_val <= np.iinfo(np.uint16).max:
+        new_dtype = np.uint16
+    elif max_val <= np.iinfo(np.uint32).max:
+        new_dtype = np.uint32
+    else:
+        new_dtype = np.uint64
+
+    return arr.astype(new_dtype)
+
+path_to_cell_ids_out = sys.argv[1]
+path_to_gene_names_out = sys.argv[2] 
+path_to_transcripts_out = sys.argv[3]
+
+output_zarr_path = sys.argv[4]
+
+cell_df = pd.read_csv(path_to_cell_ids_out, index_col=0)
+var_names = pd.read_csv(path_to_gene_names_out, index_col=0)
+df = pd.read_csv(path_to_transcripts_out, index_col=0)
+print(df.head())
+
+##converting to dask transcript output for spatialdata format
+df = df.loc[:,["x", "y", "z", "target", "updated_cellID", "updated_celltype", "UMI_transID"]]
+df.rename(columns={"updated_cellID": "cell_id", "target": "feature_name", "UMI_transID": "transcript_id"}, inplace=True)
+transcripts_dask = dask.dataframe.from_pandas(df, npartitions = 1)
+
+
+sdata_transcripts_only = sd.SpatialData(
+    points={
+        "transcripts": sd.models.PointsModel.parse(transcripts_dask)
+    },
+    tables={
+        "table": ad.AnnData(
+          obs=cell_df.loc[:,['updated_cellID', 'updated_celltype']],
+          var=pd.DataFrame(index=var_names['x'])
+        )
+    }
+)
+sdata_transcripts_only.write(output_zarr_path)