Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions main/como/combine_distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
_OutputCombinedSourceFilepath,
_SourceWeights,
)
from como.pipelines.identifier import convert
from como.utils import LogLevel, get_missing_gene_data, log_and_raise_error, num_columns
from como.pipelines.identifier import get_remaining_identifiers
from como.utils import num_columns


Expand Down Expand Up @@ -287,7 +286,7 @@ async def _begin_combining_distributions(
matrix_subset = matrix_subset.set_index(keys=[GeneIdentifier.entrez_gene_id.value], drop=True)
matrix_subset = matrix_subset.drop(columns=["gene_symbol", "ensembl_gene_id"], errors="ignore")
elif isinstance(matrix, sc.AnnData):
conversion = convert(ids=matrix.var_names.tolist(), taxon=taxon)
conversion = get_remaining_identifiers(ids=matrix.var_names.tolist(), taxon=taxon)
conversion.reset_index(drop=False, inplace=True)
matrix: pd.DataFrame = matrix.to_df().T
matrix.reset_index(inplace=True, drop=False, names=["gene_symbol"])
Expand Down
2 changes: 1 addition & 1 deletion main/como/merge_xomics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
_SourceWeights,
)
from como.project import Config
from como.utils import get_missing_gene_data, log_and_raise_error, read_file, return_placeholder_data, set_up_logging
from como.utils import get_missing_gene_data, read_file, return_placeholder_data, set_up_logging


class _MergedHeaderNames:
Expand Down
212 changes: 161 additions & 51 deletions main/como/pipelines/identifier.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,56 @@
from collections.abc import Sequence
from typing import Literal
from collections.abc import Iterable, Iterator, Sequence
from typing import Any, Literal, overload

import pandas as pd
from bioservices.mygeneinfo import MyGeneInfo
from tqdm import tqdm

__all__ = [
"convert",
"build_gene_info",
"contains_identical_gene_types",
"determine_gene_type",
"get_remaining_identifiers",
]

T_IDS = int | str | Iterable[int] | Iterable[str] | Iterable[int | str]
T_MG_SCOPE = Literal["entrezgene", "ensembl.gene", "symbol"]
T_MG_TRANSLATE = Literal["entrez_gene_id", "ensembl_gene_id", "gene_symbol"]
T_MG_RETURN = list[dict[T_MG_TRANSLATE, str]]


def _get_conversion(info: MyGeneInfo, values: list[str], scope: T_MG_SCOPE, fields: str, taxon: str) -> T_MG_RETURN:
value_str = ",".join(map(str, values))
results = info.get_queries(query=value_str, dotfield=True, scopes=scope, fields=fields, species=taxon)
if not isinstance(results, list):
raise TypeError(f"Expected results to be a list, but got {type(results)}")
if not isinstance(results[0], dict):
raise TypeError(f"Expected each result to be a dict, but got {type(results[0])}")

data: T_MG_RETURN = []
for result in results:
ensembl = result.get("query" if scope == "ensembl.gene" else "ensembl.gene")
entrez = result.get("query" if scope == "entrezgene" else "entrezgene")
symbol = result.get("query" if scope == "symbol" else "symbol")
data.append({"ensembl_gene_id": ensembl, "entrez_gene_id": entrez, "gene_symbol": symbol})
def _get_conversion(info: MyGeneInfo, values: T_IDS, taxon: str | int) -> list[dict[str, Any]]:
    """Query MyGene.info in chunks for genomic metadata about `values`.

    :param info: configured MyGeneInfo client
    :param values: one or more gene identifiers; all must be the same type
    :param taxon: taxonomic identifier, forwarded to MyGene as `species`
    :return: raw per-query result dictionaries as returned by MyGene.info
    :raises ValueError: mixed identifier types, or the query URL is too long (HTTP 414)
    :raises TypeError: the service returned an unexpected payload shape
    """
    value_list = sorted(map(str, [values] if isinstance(values, (int, str)) else values))
    data_type = determine_gene_type(value_list)
    # NOTE: `determine_gene_type` keys its result on the version-stripped identifier
    # (e.g. "ENSG...003" for "ENSG...003.14"), so indexing it with the raw
    # `value_list[0]` would KeyError on versioned IDs; compare the values directly.
    if not contains_identical_gene_types(data_type):
        raise ValueError("All items in ids must be of the same type (Entrez, Ensembl, or symbols).")

    chunk_size = 1000
    taxon_str = str(taxon)
    scope: T_MG_SCOPE = next(iter(data_type.values()))
    data: list[dict[str, Any]] = []

    for start in tqdm(range(0, len(value_list), chunk_size), desc=f"Getting info for '{scope}'"):
        result = info.get_queries(
            query=",".join(value_list[start : start + chunk_size]),
            dotfield=True,
            scopes=scope,
            fields="ensembl.gene,entrezgene,symbol,genomic_pos.start,genomic_pos.end,taxid,notfound",
            species=taxon_str,
        )
        if isinstance(result, int) and result == 414:  # HTTP 414: request URI too long
            raise ValueError(
                f"Query too long. Reduce the number of IDs in each query chunk (current chunk size: {chunk_size})."
            )
        if not isinstance(result, list):
            raise TypeError(f"Expected results to be a list, but got {type(result)}")
        # guard against an empty list before indexing element 0
        if result and not isinstance(result[0], dict):
            raise TypeError(f"Expected each result to be a dict, but got {type(result[0])}")
        data.extend(result)
    return data


def convert(ids: int | str | Sequence[int] | Sequence[str] | Sequence[int | str], taxon: int | str, cache: bool = True):
def get_remaining_identifiers(ids: T_IDS, taxon: int | str, cache: bool = True):
"""Convert between genomic identifiers.

This function will convert between the following components:
Expand All @@ -46,33 +65,111 @@ def convert(ids: int | str | Sequence[int] | Sequence[str] | Sequence[int | str]
:return: DataFrame with columns "entrez_gene_id", "ensembl_gene_id", and "gene_symbol"
"""
my_geneinfo = MyGeneInfo(cache=cache)
chunk_size = 1000
id_list = list(map(str, [ids] if isinstance(ids, (int, str)) else ids))
chunks = list(range(0, len(id_list), chunk_size))
gene_data = _get_conversion(info=my_geneinfo, values=ids, taxon=taxon)
df = (
pd.json_normalize(gene_data)
.rename(
columns={
"ensembl.gene": "ensembl_gene_id",
"entrezgene": "entrez_gene_id",
"symbol": "gene_symbol",
"taxid": "taxon_id",
}
)
.drop(
columns=["query", "_id", "_score", "genomic_pos.end", "genomic_pos.start"],
errors="ignore",
)
)
df = df[df["taxon_id"] == taxon]
df["taxon_id"] = df["taxon_id"].astype(int, copy=True)

# BUG: For an unknown reason, some Ensembl IDs are actually Entrez IDs
# To filter these, two approaches can be done:
# 1) Remove rows where Ensembl IDs are integers
# 2) Remove rows where Ensembl IDs equal Entrez IDs
# We are selecting option 1 because it goes for the root cause: Ensembl IDs are not pure integers
mask = df["ensembl_gene_id"].astype(str).str.fullmatch(r"\d+").fillna(False)
df = df[
(df["ensembl_gene_id"].astype("string").notna()) # remove NA values
& (~df["ensembl_gene_id"].astype("string").str.fullmatch(r"\d+")) # remove Entrez IDs
]
return df


def _to_scalar(val) -> int:
"""Calculate the distance between end (e) and start (s)."""
if isinstance(val, list):
return int(sum(val) / len(val)) if val else 0 # `if val` checks that the list contains items
if pd.isna(val):
return 0
return int(val)


def build_gene_info(ids: T_IDS, taxon: int | str, cache: bool = True):
    """Get genomic information from a given set of IDs.

    The input should be of the same type, otherwise this function will fail.
    Expected types are:
        - Ensembl Gene ID
        - Entrez Gene ID
        - Gene Symbol

    The returned data frame will have the following columns:
        - ensembl_gene_id
        - entrez_gene_id
        - gene_symbol
        - size (distance between genomic end and start)

    :param ids: IDs to be converted
    :param taxon: Taxonomic identifier
    :param cache: Should local caching be used for queries
    :return: pandas.DataFrame
    """
    my_geneinfo = MyGeneInfo(cache=cache)
    gene_data = _get_conversion(info=my_geneinfo, values=ids, taxon=taxon)
    df = pd.json_normalize(gene_data).rename(columns={"taxid": "taxon_id"})

    # MyGene returns `taxid` as an integer; compare as strings so callers may pass
    # the taxon as either 9606 or "9606" without silently emptying the frame.
    df = df[df["taxon_id"].astype(str) == str(taxon)]
    df["taxon_id"] = df["taxon_id"].astype(int, copy=True)

    # genomic positions may be scalars, NA, or lists; collapse each side to an int first
    df["size"] = df["genomic_pos.end"].fillna(0).map(_to_scalar) - df["genomic_pos.start"].fillna(0).map(_to_scalar)
    df = (
        df[df["size"] != 0]  # drop rows with no usable genomic position
        .drop(
            columns=[
                "query",
                "_id",
                "_score",
                "genomic_pos.start",
                "genomic_pos.end",
                "notfound",
            ],
            errors="ignore",
        )
        .rename(
            columns={
                "ensembl.gene": "ensembl_gene_id",
                "entrezgene": "entrez_gene_id",
                "symbol": "gene_symbol",
            }
        )
        # one row per Ensembl ID when a single gene maps to several
        .explode(column=["ensembl_gene_id"])
        .sort_values(by="ensembl_gene_id")
    )

    return df


@overload
def determine_gene_type(items: str, /) -> T_MG_SCOPE: ...


@overload
def determine_gene_type(items: Sequence[str], /) -> dict[str, T_MG_SCOPE]: ...

def determine_gene_type(items: str | list[str], /) -> dict[str, T_MG_SCOPE]:

def determine_gene_type(items: str | Sequence[str], /) -> str | dict[str, T_MG_SCOPE]:
"""Determine the genomic data type.

:param items: A string or list of strings representing gene identifiers.
Expand All @@ -85,16 +182,29 @@ def determine_gene_type(items: str | list[str], /) -> dict[str, T_MG_SCOPE]:
followed by a specific format (length greater than 11 and the last 11 characters are digits).
- "gene_symbol": If the item does not match the above criteria, it is assumed to be a gene symbol.
"""
items = [items] if isinstance(items, str) else items

determine: dict[str, Literal["entrezgene", "ensembl.gene", "symbol"]] = {}
for i in items:
i_str = str(i).split(".")[0] if isinstance(i, float) else str(i)
if i_str.isdigit():
determine[i_str] = "entrezgene"
elif i_str.startswith("ENS") and (len(i_str) > 11 and all(i.isdigit() for i in i_str[-11:])):
determine[i_str] = "ensembl.gene"
values = (items,) if isinstance(items, str) else items
result: dict[str, Literal["entrezgene", "ensembl.gene", "symbol"]] = {}

for i in values:
s = str(i).partition(".")[0] # remove any transcripts that may exist

if s.startswith("ENS") and len(s) > 11 and s[-11:].isdigit():
result[s] = "ensembl.gene"
elif s.isdigit():
result[s] = "entrezgene"
else:
determine[i_str] = "symbol"
result[s] = "symbol"

if isinstance(items, str):
return result[items]
return result

return determine

def contains_identical_gene_types(values: dict[str, T_MG_SCOPE] | Sequence[T_MG_SCOPE]) -> bool:
"""Check if all values in the input are identical.

:param values: A dictionary mapping gene identifiers to their types or a sequence of gene types.
:return: True if all values are identical, False otherwise.
"""
data = values if not isinstance(values, dict) else list(values.values())
return all(v == data[0] for v in data)
2 changes: 1 addition & 1 deletion main/como/rnaseq_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

from como.data_types import FilteringTechnique, LogLevel, RNAType
from como.migrations import gene_info_migrations
from como.pipelines.identifier import convert
from como.pipelines.identifier import contains_identical_gene_types, determine_gene_type
from como.project import Config
from como.utils import read_file, set_up_logging

Expand Down
90 changes: 4 additions & 86 deletions main/como/rnaseq_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from loguru import logger

from como.data_types import LogLevel, RNAType
from como.pipelines.identifier import convert
from como.pipelines.identifier import build_gene_info, get_remaining_identifiers
from como.utils import read_file, set_up_logging


Expand Down Expand Up @@ -473,7 +473,7 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
if isinstance(data_, pd.DataFrame):
return data_["ensembl_gene_id"].tolist()
try:
conversion = convert(ids=data_.var_names.tolist(), taxon=taxon)
conversion = get_remaining_identifiers(ids=data_.var_names.tolist(), taxon=taxon)
except json.JSONDecodeError as e:
raise ValueError(f"Got a JSON decode error for file '{counts_matrix_filepaths}' ({e})")

Expand All @@ -486,90 +486,8 @@ async def read_ensembl_gene_ids(file: Path) -> list[str]:
"depending on the number of genes and your internet connection"
)

ensembl_ids: set[str] = set(
chain.from_iterable(await asyncio.gather(*[read_ensembl_gene_ids(f) for f in counts_matrix_filepaths]))
)
gene_data: list[dict[str, str | int | list[str] | list[int] | None]] = await MyGene(cache=cache).query(
items=list(ensembl_ids),
taxon=taxon,
scopes="ensemblgene",
)

n = len(gene_data)
all_gene_symbols: list[str] = ["-"] * n
all_entrez_ids: list[str | int] = ["-"] * n
all_ensembl_ids: list[str] = ["-"] * n
all_sizes: list[int] = [-1] * n

def _avg_pos(value: int | list[int] | None) -> int:
if value is None:
return 0
if isinstance(value, list):
return int(sum(value) / len(value)) if value else 0
return int(value)

for i, data in enumerate(gene_data):
data: dict[str, str | int | list[str] | list[int] | None]
if "genomic_pos.start" not in data:
log_and_raise_error(
message="Unexpectedly missing key 'genomic_pos.start'", error=KeyError, level=LogLevel.WARNING
)
if "genomic_pos.end" not in data:
log_and_raise_error(
message="Unexpectedly missing key 'genomic_pos.end'", error=KeyError, level=LogLevel.WARNING
)
if "ensembl.gene" not in data:
log_and_raise_error(
message="Unexpectedly missing key 'ensembl.gene'", error=KeyError, level=LogLevel.WARNING
)

start = data["genomic_pos.start"]
end = data["genomic_pos.end"]
ensembl_id = data["ensembl.gene"]

if not isinstance(start, int):
log_and_raise_error(
message=f"Unexpected type for 'genomic_pos.start': expected int, got {type(start)}",
error=TypeError,
level=LogLevel.WARNING,
)
if not isinstance(end, int):
log_and_raise_error(
message=f"Unexpected type for 'genomic_pos.end': expected int, got {type(start)}",
error=TypeError,
level=LogLevel.WARNING,
)
if not isinstance(ensembl_id, str):
log_and_raise_error(
message=f"Unexpected type for 'ensembl.gene': expected str, got {type(ensembl_id)}",
error=ValueError,
level=LogLevel.WARNING,
)

size = end - start
all_ensembl_ids[i] = ",".join(map(str, ensembl_id)) if isinstance(ensembl_id, list) else ensembl_id
all_gene_symbols[i] = str(data.get("symbol", "-"))
all_entrez_ids[i] = str(data.get("entrezgene", "-"))
all_sizes[i] = max(size, -1) # use `size` otherwise -1

gene_info: pd.DataFrame = pd.DataFrame(
{
"ensembl_gene_id": all_ensembl_ids,
"gene_symbol": all_gene_symbols,
"entrez_gene_id": all_entrez_ids,
"size": all_sizes,
}
)

# remove rows where every gene size value is -1 (not available)
gene_info = gene_info[~(gene_info == -1).all(axis=1)]

gene_info["ensembl_gene_id"] = gene_info["ensembl_gene_id"].str.split(",") # extend lists into multiple rows
gene_info = gene_info.explode(column=["ensembl_gene_id"])
# we would set `entrez_gene_id` to int here as well, but not all ensembl ids are mapped to entrez ids,
# and as a result, there are still "-" values in the entrez id column that cannot be converted to an integer

gene_info = gene_info.sort_values(by="ensembl_gene_id")
ensembl_ids: set[str] = set(chain.from_iterable(read_ensembl_gene_ids(f) for f in counts_matrix_filepaths))
gene_info = build_gene_info(ids=ensembl_ids, taxon=taxon, cache=cache)
output_filepath.parent.mkdir(parents=True, exist_ok=True)
gene_info.to_csv(output_filepath, index=False)
logger.success(f"Gene Info file written at '{output_filepath}'")
Expand Down
Loading
Loading