Downstream analysis¶
Seurat/Scanpy¶
SPLISOSM¶
- SPLISOSM (SPatiaL ISOform Statistical Modeling) is a Python package for analyzing RNA isoform patterns in spatial transcriptomics (ST) data.
- BroCOLI can be used to quantify RNA isoforms in ST data and generate count matrices for SPLISOSM.
- You can use BroCOLI's output files (counts_transcript_*.txt) to run SPLISOSM. The SPLISOSM package is available on Github.
Step 0: Prepare the data for SPLISOSM.
- gtf_path is the raw genome annotation file (GTF format), used to map between Ensembl gene IDs and gene symbols.
- spatial_folder is the path to the folder required for constructing an AnnData object from spatial transcriptomics data. This folder will be automatically generated by SpaceRanger.
from gtfparse import read_gtf
import polars as pl
# Prepare the data
transcript_exper_path = "/data/Data/SPLISOMO/MOB/counts_transcript_0.txt"
gtf_path = "/data/Reference/mouse/refdata-gex-GRCm39-2024-A/genes/genes.gtf"
spatial_folder = "/data/NAR_Mouse_spatial/MOB/GSM4656181_MOB-illumina/spatial"
# Handling GTF
gtf = read_gtf(gtf_path)
genes = (gtf.filter(pl.col("feature") == "gene").select(["gene_id", "gene_name"]).unique())
id2name = dict(zip(genes["gene_id"].to_list(), genes["gene_name"].to_list()))
Step 1: Create an AnnData object.
Before building the AnnData object, an optional filtering step can be applied to remove transcripts with low expression levels, which helps reduce noise in downstream analyses. The following code does not have any filtering.
import json
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from PIL import Image
def _harmonize_barcodes(expr_barcodes: pd.Index, pos_barcodes: pd.Index) -> pd.Index:
"""Try to harmonize barcodes between expression matrix and tissue_positions."""
expr = pd.Index(expr_barcodes.astype(str))
pos = pd.Index(pos_barcodes.astype(str))
# If one side has "-1" suffix broadly and the other doesn't, fix expression side
expr_has_suffix = expr.str.contains(r"-\d+$").mean() > 0.8
pos_has_suffix = pos.str.contains(r"-\d+$").mean() > 0.8
if pos_has_suffix and not expr_has_suffix:
expr2 = expr + "-1"
return expr2
if expr_has_suffix and not pos_has_suffix:
expr2 = expr.str.replace(r"-\d+$", "", regex=True)
return expr2
return expr
def build_visium_anndata_from_tx_counts(
tx_path: str,
spatial_dir: str,
id2name: dict | None = None,
library_id: str = "slice1",
positions_file: str = "tissue_positions_list.csv",
scalefactors_file: str = "scalefactors_json.json",
hires_image_file: str = "tissue_hires_image.png",
lowres_image_file: str = "tissue_lowres_image.png",
sep: str = "\t",
make_log1p_layer: bool = True,
target_sum: float = 1e4,
require_all_barcodes: bool = False,
):
# ---- 1) Read transcript table ----
tx = pd.read_csv(tx_path, sep=sep).dropna()
if tx.shape[1] < 3:
raise ValueError(
f"tx_path has <3 columns (got {tx.shape[1]}). Expected transcript_id, gene_id, then spot count columns."
)
# a gene more than one transcript
col2 = tx.columns[1]
col2times = tx[col2].value_counts()
tx = tx[tx[col2].isin(col2times[col2times >= 2].index)]
transcript_id = tx.iloc[:, 0].astype(str)
gene_id = tx.iloc[:, 1].astype(str)
counts = tx.iloc[:, 2:].copy()
counts.index = transcript_id.values
counts = counts.apply(pd.to_numeric, errors="raise")
# AnnData: obs=spots, var=transcripts
adata = ad.AnnData(X=counts.T)
adata.layers["counts"] = adata.X.copy()
adata.var["transcript_id"] = adata.var_names
adata.var["gene_id"] = gene_id.values
if id2name is not None:
adata.var["gene_symbol"] = adata.var["gene_id"].map(id2name)
# ---- 2) Read tissue positions ----
pos_path = spatial_dir.rstrip("/") + "/" + positions_file
# old tissue_positions_list.csv no header
pos = pd.read_csv(
pos_path,
header=None,
names=[
"barcode",
"in_tissue",
"array_row",
"array_col",
"pxl_row_in_fullres",
"pxl_col_in_fullres",
],
).set_index("barcode")
# ---- 3) Harmonize barcodes (fix -1) ----
expr_barcodes = pd.Index(adata.obs_names)
pos_barcodes = pd.Index(pos.index)
expr_barcodes_fixed = _harmonize_barcodes(expr_barcodes, pos_barcodes)
# apply fix to adata obs_names
adata.obs_names = expr_barcodes_fixed
# now align
expr_barcodes = pd.Index(adata.obs_names)
missing = expr_barcodes.difference(pos.index)
if len(missing) > 0:
msg = (
f"After harmonizing barcodes, tissue_positions is still missing {len(missing)} barcodes "
f"(example: {missing[:5].tolist()}). Using intersection only."
)
if require_all_barcodes:
raise ValueError(msg)
else:
keep = expr_barcodes.intersection(pos.index)
if len(keep) == 0:
raise ValueError(
msg + "\nNo overlapping barcodes at all. 你需要检查:表达矩阵列名是否真的是 Visium barcode. "
)
adata = adata[keep, :].copy()
pos = pos.loc[keep]
else:
pos = pos.loc[expr_barcodes]
# ---- 4) Write spatial info ----
adata.obs["in_tissue"] = pos["in_tissue"].astype(int).values
adata.obs["array_row"] = pos["array_row"].astype(int).values
adata.obs["array_col"] = pos["array_col"].astype(int).values
adata.obsm["spatial"] = pos[["pxl_col_in_fullres", "pxl_row_in_fullres"]].to_numpy()
# ---- 5) scalefactors + images -> uns['spatial'] ----
scale_path = spatial_dir.rstrip("/") + "/" + scalefactors_file
hires_path = spatial_dir.rstrip("/") + "/" + hires_image_file
lowres_path = spatial_dir.rstrip("/") + "/" + lowres_image_file
with open(scale_path, "r") as f:
scalefactors = json.load(f)
img_hires = np.array(Image.open(hires_path))
img_lowres = np.array(Image.open(lowres_path))
adata.uns["spatial"] = {
library_id: {
"images": {"hires": img_hires, "lowres": img_lowres},
"scalefactors": scalefactors,
"metadata": {},
}
}
# ---- 6) QC + optional log1p layer ----
sc.pp.calculate_qc_metrics(adata, inplace=True)
adata.obs["nCount_RNA"] = adata.obs["total_counts"]
adata.obs["nFeature_RNA"] = adata.obs["n_genes_by_counts"]
if make_log1p_layer:
adata.layers["log1p"] = adata.layers["counts"].copy()
sc.pp.normalize_total(adata, layer="log1p", target_sum=target_sum)
sc.pp.log1p(adata, layer="log1p")
adata.uns["log1p"] = {"base": None}
return adata
Step 2: The same procedure in SPLISOSM.
These codes are from SPLISOSM. There might be some differences due to version issues. Our current version is installed via pip as v1.0.2.
from splisosm import SplisosmNP
data, _, gene_names, _ = extract_counts_n_ratios(
Myadata, layer='counts', group_iso_by='gene_symbol'
)
coordinates = Myadata.obs.loc[:, ['array_row', 'array_col']]
# minimal input data, extracted from AnnData as shown above
data = data # list of length n_gene, each a (n_spot, n_iso) tensor
coordinates = coordinates # (n_spot, 2), spatial coordinates
gene_names = gene_names # list of length n_gene, gene names
# initialize the model
# for large datasets, consider setting approx_rank to a smaller value (e.g., 50) to speed up computation
model_np = SplisosmNP()
model_np.setup_data(data, coordinates, gene_names=gene_names, approx_rank=None)
# per-gene test for spatial variability
# method can be 'hsic-ir' (isoform ratio), 'hsic-ic' (isoform counts)
# 'hsic-gc' (gene counts), 'spark-x' (gene counts)
model_np.test_spatial_variability(
method="hsic-ir",
ratio_transformation='none', # only for 'hsic-ir': 'none', 'clr', 'ilr', 'alr', 'radial'
nan_filling='mean', # 'mean' (global mean) or 'none' (ignore NaN spots)
)
# extract the per-gene test statistics
df_sv_res = model_np.get_formatted_test_results(test_type='sv')
df_sv_res[df_sv_res["pvalue_adj"] < 0.05]
| Header 1 | Header 2 | Header 3 |
|---|---|---|
| Row 1 | Data | Data |
| Row 2 | Data | Data |
This is a blockquote Multiple lines
Nested quote