Downstream analysis¶

SQANTI3
DESeq2
rMATs long
Seurat/Scanpy
SPLISOSM

Seurat/Scanpy¶

PythonR

print("Scanpy!")

print("Seurat!")

SPLISOSM¶

SPLISOSM (SPatiaL ISOform Statistical Modeling) is a Python package for analyzing RNA isoform patterns in spatial transcriptomics (ST) data.
BroCOLI can be used to quantify RNA isoforms in ST data and generate count matrices for SPLISOSM.
You can use BroCOLI's output files (counts_transcript_*.txt) to run SPLISOSM. The SPLISOSM package is available on Github.

Step 0: Prepare the data for SPLISOSM.

gtf_path is the raw genome annotation file (GTF format), used to map between Ensembl gene IDs and gene symbols.
spatial_folder is the path to the folder required for constructing an AnnData object from spatial transcriptomics data. This folder will be automatically generated by SpaceRanger.

Python

from gtfparse import read_gtf
import polars as pl

# Prepare the data
transcript_exper_path = "/data/Data/SPLISOMO/MOB/counts_transcript_0.txt"
gtf_path = "/data/Reference/mouse/refdata-gex-GRCm39-2024-A/genes/genes.gtf"
spatial_folder = "/data/NAR_Mouse_spatial/MOB/GSM4656181_MOB-illumina/spatial"   

# Handling GTF
gtf = read_gtf(gtf_path)
genes = (gtf.filter(pl.col("feature") == "gene").select(["gene_id", "gene_name"]).unique())
id2name = dict(zip(genes["gene_id"].to_list(), genes["gene_name"].to_list()))

Step 1: Create an AnnData object.

Before building the AnnData object, an optional filtering step can be applied to remove transcripts with low expression levels, which helps reduce noise in downstream analyses. The following code does not have any filtering.

Python

import json
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from PIL import Image

def _harmonize_barcodes(expr_barcodes: pd.Index, pos_barcodes: pd.Index) -> pd.Index:
    """Try to harmonize barcodes between expression matrix and tissue_positions."""
    expr = pd.Index(expr_barcodes.astype(str))
    pos = pd.Index(pos_barcodes.astype(str))
    # If one side has "-1" suffix broadly and the other doesn't, fix expression side
    expr_has_suffix = expr.str.contains(r"-\d+$").mean() > 0.8
    pos_has_suffix = pos.str.contains(r"-\d+$").mean() > 0.8
    if pos_has_suffix and not expr_has_suffix:
        expr2 = expr + "-1"
        return expr2
    if expr_has_suffix and not pos_has_suffix:
        expr2 = expr.str.replace(r"-\d+$", "", regex=True)
        return expr2
    return expr    

def build_visium_anndata_from_tx_counts(
    tx_path: str,
    spatial_dir: str,
    id2name: dict | None = None,
    library_id: str = "slice1",
    positions_file: str = "tissue_positions_list.csv",
    scalefactors_file: str = "scalefactors_json.json",
    hires_image_file: str = "tissue_hires_image.png",
    lowres_image_file: str = "tissue_lowres_image.png",
    sep: str = "\t",
    make_log1p_layer: bool = True,
    target_sum: float = 1e4,
    require_all_barcodes: bool = False,
):
    # ---- 1) Read transcript table ----
    tx = pd.read_csv(tx_path, sep=sep).dropna()
    if tx.shape[1] < 3:
        raise ValueError(
            f"tx_path has <3 columns (got {tx.shape[1]}). Expected transcript_id, gene_id, then spot count columns."
        )
    # a gene more than one transcript
    col2 = tx.columns[1]
    col2times = tx[col2].value_counts()
    tx = tx[tx[col2].isin(col2times[col2times >= 2].index)]        

    transcript_id = tx.iloc[:, 0].astype(str)
    gene_id = tx.iloc[:, 1].astype(str)

    counts = tx.iloc[:, 2:].copy()
    counts.index = transcript_id.values
    counts = counts.apply(pd.to_numeric, errors="raise")

    # AnnData: obs=spots, var=transcripts
    adata = ad.AnnData(X=counts.T)
    adata.layers["counts"] = adata.X.copy()

    adata.var["transcript_id"] = adata.var_names
    adata.var["gene_id"] = gene_id.values
    if id2name is not None:
        adata.var["gene_symbol"] = adata.var["gene_id"].map(id2name)

    # ---- 2) Read tissue positions ----
    pos_path = spatial_dir.rstrip("/") + "/" + positions_file

    # old tissue_positions_list.csv no header
    pos = pd.read_csv(
        pos_path,
        header=None,
        names=[
            "barcode",
            "in_tissue",
            "array_row",
            "array_col",
            "pxl_row_in_fullres",
            "pxl_col_in_fullres",
        ],
    ).set_index("barcode")

    # ---- 3) Harmonize barcodes (fix -1) ----
    expr_barcodes = pd.Index(adata.obs_names)
    pos_barcodes = pd.Index(pos.index)
    expr_barcodes_fixed = _harmonize_barcodes(expr_barcodes, pos_barcodes)

    # apply fix to adata obs_names
    adata.obs_names = expr_barcodes_fixed

    # now align
    expr_barcodes = pd.Index(adata.obs_names)
    missing = expr_barcodes.difference(pos.index)

    if len(missing) > 0:
        msg = (
            f"After harmonizing barcodes, tissue_positions is still missing {len(missing)} barcodes "
            f"(example: {missing[:5].tolist()}). Using intersection only."
        )
        if require_all_barcodes:
            raise ValueError(msg)
        else:
            keep = expr_barcodes.intersection(pos.index)
            if len(keep) == 0:
                raise ValueError(
                    msg + "\nNo overlapping barcodes at all. 你需要检查：表达矩阵列名是否真的是 Visium barcode. "
                )
            adata = adata[keep, :].copy()
            pos = pos.loc[keep]
    else:
        pos = pos.loc[expr_barcodes]

    # ---- 4) Write spatial info ----
    adata.obs["in_tissue"] = pos["in_tissue"].astype(int).values
    adata.obs["array_row"] = pos["array_row"].astype(int).values
    adata.obs["array_col"] = pos["array_col"].astype(int).values
    adata.obsm["spatial"] = pos[["pxl_col_in_fullres", "pxl_row_in_fullres"]].to_numpy()

    # ---- 5) scalefactors + images -> uns['spatial'] ----
    scale_path = spatial_dir.rstrip("/") + "/" + scalefactors_file
    hires_path = spatial_dir.rstrip("/") + "/" + hires_image_file
    lowres_path = spatial_dir.rstrip("/") + "/" + lowres_image_file

    with open(scale_path, "r") as f:
        scalefactors = json.load(f)

    img_hires = np.array(Image.open(hires_path))
    img_lowres = np.array(Image.open(lowres_path))

    adata.uns["spatial"] = {
        library_id: {
            "images": {"hires": img_hires, "lowres": img_lowres},
            "scalefactors": scalefactors,
            "metadata": {},
        }
    }

    # ---- 6) QC + optional log1p layer ----
    sc.pp.calculate_qc_metrics(adata, inplace=True)
    adata.obs["nCount_RNA"] = adata.obs["total_counts"]
    adata.obs["nFeature_RNA"] = adata.obs["n_genes_by_counts"]

    if make_log1p_layer:
        adata.layers["log1p"] = adata.layers["counts"].copy()
        sc.pp.normalize_total(adata, layer="log1p", target_sum=target_sum)
        sc.pp.log1p(adata, layer="log1p")
        adata.uns["log1p"] = {"base": None}

    return adata

Myadata = build_visium_anndata_from_tx_counts(transcript_exper_path, spatial_folder, id2name)

Step 2: The same procedure in SPLISOSM.

These codes are from SPLISOSM. There might be some differences due to version issues. Our current version is installed via pip as v1.0.2.

Python

from splisosm import SplisosmNP

data, _, gene_names, _ = extract_counts_n_ratios(
    Myadata, layer='counts', group_iso_by='gene_symbol'
)
coordinates = Myadata.obs.loc[:, ['array_row', 'array_col']]

# minimal input data, extracted from AnnData as shown above
data = data  # list of length n_gene, each a (n_spot, n_iso) tensor
coordinates = coordinates  # (n_spot, 2), spatial coordinates
gene_names = gene_names  # list of length n_gene, gene names

# initialize the model
# for large datasets, consider setting approx_rank to a smaller value (e.g., 50) to speed up computation
model_np = SplisosmNP()
model_np.setup_data(data, coordinates, gene_names=gene_names, approx_rank=None)

# per-gene test for spatial variability
# method can be 'hsic-ir' (isoform ratio), 'hsic-ic' (isoform counts)
# 'hsic-gc' (gene counts), 'spark-x' (gene counts)
model_np.test_spatial_variability(
    method="hsic-ir",
    ratio_transformation='none',  # only for 'hsic-ir': 'none', 'clr', 'ilr', 'alr', 'radial'
    nan_filling='mean',           # 'mean' (global mean) or 'none' (ignore NaN spots)
)

# extract the per-gene test statistics
df_sv_res = model_np.get_formatted_test_results(test_type='sv')
df_sv_res[df_sv_res["pvalue_adj"] < 0.05]

Header 1	Header 2	Header 3
Row 1	Data	Data
Row 2	Data	Data

This is a blockquote Multiple lines

Nested quote