Melanoma TME (Tirosh et al.)

[1]:

import scanpy as sc
import os
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.stats

sc.settings.verbosity = 3

[2]:

import sys
sys.path.insert(0,'..')
import scmer

1. Load and clean data

We download data from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE72056

Tirosh I, Izar B, Prakadan SM, Wadsworth MH 2nd et al. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 2016 Apr 8;352(6282):189-96.

1.1 Preprocessing

We first rename duplicate gene names.

[3]:

data = pd.read_table("../../Melanoma/GSE72056_melanoma_single_cell_revised_v2.txt", header=0)
def dedup(x):
    d = {}
    r = []
    for i in x:
        if i in d:
            d[i] += 1
            r.append(i + '_' + str(d[i]))
            print(i, 'renamed to', r[-1])
        else:
            d[i] = 1
            r.append(i)
    return r

data['Cell'] = dedup(data['Cell'])

data = data.T

data.columns = data.loc["Cell"]
data.drop('Cell', inplace=True)
data

MARCH2 renamed to MARCH2_2
MARCH1 renamed to MARCH1_2

[3]:

Cell	tumor	malignant(1=no,2=yes,0=unresolved)	non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)	C9orf152	RPS11	ELMO2	CREB3L1	PNMA1	MMP2	TMEM216	...	GPLD1	SNORD115-39	RAB8A	RXFP2	PCIF1	PIK3IP1	SNRPD2	SLC39A6	CTSC	AQP7
Cy72_CD45_H02_S758_comb	72	1	2	0	9.2172	0	0	0	0	0	...	0.62667	0	0	0	0	7.6069	0	0	2.6638	0
CY58_1_CD45_B02_S974_comb	58	1	1	0	8.3745	0	0	0	0	0	...	1.0545	0	0	0	0	0	0	0	6.9901	0
Cy71_CD45_D08_S524_comb	71	2	0	0	9.313	2.1263	0	0	0.73812	0	...	0.99639	0	2.7634	0	3.6782	0	3.9871	3.8777	1.6126	0
Cy81_FNA_CD45_B01_S301_comb	81	2	0	0	7.8876	0	0	0	0	0	...	0.23143	0	4.1937	0	0	0	5.2639	3.766	4.8417	0
Cy80_II_CD45_B07_S883_comb	80	2	0	0	8.3291	0	0	0	0	3.7949	...	0	0	2.5705	0	0	0	6.0824	1.7816	4.4607	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
CY75_1_CD45_CD8_7__S223_comb	75	1	1	0	5.4889	0	0	0	0	0	...	1.2962	0	0	0	0	0	6.0986	0	3.6464	0
CY75_1_CD45_CD8_1__S65_comb	75	1	1	0	4.9262	5.5296	0	0	0	0	...	0.99245	0	0	0	5.5465	3.7384	0	0	7.0004	0
CY75_1_CD45_CD8_1__S93_comb	75	1	1	0	7.0958	0	0	0	0	0	...	0.97516	0	0	0	0	0	0	0	1.9615	0
CY75_1_CD45_CD8_1__S76_comb	75	1	1	0	3.997	0	0	0	0	0	...	0.49208	0	0	0	0	0	0	0	7.1918	0
CY75_1_CD45_CD8_7__S274_comb	75	1	0	0	3.9897	0	0	0	0	0.29007	...	1.0682	0	0	0	5.0081	6.7535	5.9313	5.2398	6.1588	0

4645 rows × 23689 columns

We then separate metadata from gene expression and remove ERCC spike-ins.

[4]:

obs = data[['tumor', 'malignant(1=no,2=yes,0=unresolved)', 'non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)']]
data.drop(['tumor', 'malignant(1=no,2=yes,0=unresolved)', 'non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)'],
          axis=1, inplace=True)
print(*data.columns[data.columns.str.startswith('ERCC')], 'are dropped')
data = data.loc[:, data.columns.str.startswith('ERCC') == False]

obs

ERCC5 ERCC8 ERCC1 ERCC4 ERCC6L ERCC6L2 ERCC3 ERCC6 ERCC2 are dropped

[4]:

Cell	tumor	malignant(1=no,2=yes,0=unresolved)	non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)
Cy72_CD45_H02_S758_comb	72	1	2
CY58_1_CD45_B02_S974_comb	58	1	1
Cy71_CD45_D08_S524_comb	71	2	0
Cy81_FNA_CD45_B01_S301_comb	81	2	0
Cy80_II_CD45_B07_S883_comb	80	2	0
...	...	...	...
CY75_1_CD45_CD8_7__S223_comb	75	1	1
CY75_1_CD45_CD8_1__S65_comb	75	1	1
CY75_1_CD45_CD8_1__S93_comb	75	1	1
CY75_1_CD45_CD8_1__S76_comb	75	1	1
CY75_1_CD45_CD8_7__S274_comb	75	1	0

4645 rows × 3 columns

[5]:

pd.crosstab(obs['malignant(1=no,2=yes,0=unresolved)'], obs['non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)'])

[5]:

non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)	0.0	1.0	2.0	3.0	4.0	5.0	6.0
malignant(1=no,2=yes,0=unresolved)
0.0	90	24	3	6	3	5	1
1.0	416	2040	512	119	62	56	51
2.0	1252	4	0	1	0	0	0

[6]:

# Suppress SettingWithCopyWarning
import warnings
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Rename cell types for better appearance

malignant_dict = ['unresolved', 'normal', 'malignant']
celltype_dict = ['bordercase', 'T', 'B', 'Macro', 'Endo', 'CAF', 'NK']
obs['tumor'] = obs['tumor'].apply(lambda x: 'Mel' + str(int(x)))
obs['malignant'] = obs['malignant(1=no,2=yes,0=unresolved)'].astype(int).apply(malignant_dict.__getitem__)
obs['celltype'] = obs['non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)'].astype(int).apply(celltype_dict.__getitem__)
obs.loc[obs['malignant'] == 'malignant', 'celltype'] = 'malignant'

1.2 Create AnnData for scanpy

[7]:

adata = sc.AnnData(data)
for i in obs.columns:
    adata.obs[i] = obs[i]

[8]:

adata

[8]:

AnnData object with n_obs × n_vars = 4645 × 23677
    obs: 'tumor', 'malignant(1=no,2=yes,0=unresolved)', 'non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)', 'malignant', 'celltype'

2 Typical SCANPY analysis

2.1 QC

[9]:

sc.settings.set_figure_params(dpi=50, facecolor='white')
sc.pl.highest_expr_genes(adata, n_top=20)

normalizing counts per cell
    finished (0:00:00)

No mitochonrial genes available, which is okay.

[10]:

sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'], jitter=0.4, multi_panel=True)

C:\Users\SLiang3\Miniconda3\envs\scanpy37\lib\site-packages\anndata\_core\anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if is_string_dtype(df[key]) and not is_categorical(df[key])
... storing 'tumor' as categorical
... storing 'malignant(1=no,2=yes,0=unresolved)' as categorical
... storing 'non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)' as categorical
... storing 'malignant' as categorical
... storing 'celltype' as categorical
C:\Users\SLiang3\Miniconda3\envs\scanpy37\lib\site-packages\seaborn\_core.py:1303: UserWarning: Vertical orientation ignored with only `x` specified.
  warnings.warn(single_var_warning.format("Vertical", "x"))
C:\Users\SLiang3\Miniconda3\envs\scanpy37\lib\site-packages\seaborn\_core.py:1303: UserWarning: Vertical orientation ignored with only `x` specified.
  warnings.warn(single_var_warning.format("Vertical", "x"))
C:\Users\SLiang3\Miniconda3\envs\scanpy37\lib\site-packages\seaborn\_core.py:1303: UserWarning: Vertical orientation ignored with only `x` specified.
  warnings.warn(single_var_warning.format("Vertical", "x"))
C:\Users\SLiang3\Miniconda3\envs\scanpy37\lib\site-packages\seaborn\_core.py:1303: UserWarning: Vertical orientation ignored with only `x` specified.
  warnings.warn(single_var_warning.format("Vertical", "x"))

[11]:

sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

[12]:

sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

C:\Users\SLiang3\Miniconda3\envs\scanpy37\lib\site-packages\anndata\_core\anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

filtered out 1397 genes that are detected in less than 3 cells

2.2 Normalization, HVG Selection, and Scaling

[13]:

sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

normalizing counts per cell
    finished (0:00:00)

[14]:

sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

extracting highly variable genes
    finished (0:00:01)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

[15]:

sc.pl.highly_variable_genes(adata)

[16]:

adata.raw = adata

[17]:

adata = adata[:, adata.var.highly_variable]

[18]:

sc.pp.scale(adata, max_value=10)

C:\Users\SLiang3\Miniconda3\envs\scanpy37\lib\site-packages\scanpy\preprocessing\_simple.py:806: UserWarning: Revieved a view of an AnnData. Making a copy.
  view_to_actual(adata)

[19]:

adata

[19]:

AnnData object with n_obs × n_vars = 4645 × 6219
    obs: 'tumor', 'malignant(1=no,2=yes,0=unresolved)', 'non-malignant cell type (1=T,2=B,3=Macro.4=Endo.,5=CAF;6=NK)', 'malignant', 'celltype', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_genes'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg'

2.3 Dimensional Reduction

[20]:

sc.tl.pca(adata, svd_solver='arpack')

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)

[21]:

sc.pl.pca(adata, color=['malignant', 'celltype', 'tumor'])

[22]:

sc.pl.pca_variance_ratio(adata, log=True)

[23]:

sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20)
sc.tl.umap(adata)
sc.pl.umap(adata, color=['malignant', 'celltype', 'tumor'])

computing neighbors
    using 'X_pca' with n_pcs = 20
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:03)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:05)

3. SCMER Feature Selection

Each step contains 20 inner iterations. It can be seen that the algorithm converges in 40~60 iternations.

As a rule of thumb, the loss should end up with 3.0~4.0. 0.0 is perfect but it needs many features. A loss greater than 4.0 usually mean that the manifold is not well retained.

[24]:

model = scmer.UmapL1(lasso=3.87e-4, ridge=0., n_pcs=40, perplexity=100., use_beta_in_Q=True, n_threads=6, pca_seed=2020)
model.fit(adata.X)

Calculating distance matrix and scaling factors...
Computing pairwise distances...
Using 6 threads...
Mean value of sigma: 1.206104
Done. Elapsed time: 20.78 seconds. Total: 20.78 seconds.
Creating model without batches...
Optimizing using OWLQN (because lasso is nonzero)...
0 loss: 6.253872871398926 Nonzero: 80 Elapsed time: 89.79 seconds. Total: 110.58 seconds.
1 loss: 3.090698003768921 Nonzero: 76 Elapsed time: 89.34 seconds. Total: 199.92 seconds.
2 loss: 3.0756547451019287 Nonzero: 75 Elapsed time: 102.79 seconds. Total: 302.71 seconds.
3 loss: 3.0753679275512695 Nonzero: 75 Elapsed time: 108.61 seconds. Total: 411.32 seconds.
4 loss: 3.0753376483917236 Nonzero: 75 Elapsed time: 85.93 seconds. Total: 497.25 seconds.
final loss: 2.9645748138427734 Nonzero: 75 Elapsed time: 1.45 seconds. Total: 498.70 seconds.

[24]:

<scmer._umap_l1.UmapL1 at 0x24547e99dc8>

Selected markers:

[25]:

print(*adata.var_names[model.get_mask()])

RPS11 CXCL13 B2M GNLY SERPINA3 CD55 TXNIP DEDD RPS4Y1 RGS1 PTPRC PDCD1 HAVCR2 A2M TYR COL1A2 VIM SRGN ELK2AP NME2 GPR183 CDH5 PAEP KIT CCR7 TCL1A ANXA1 C1orf56 NKG7 MFAP4 GZMH MTRNR2L2 GZMA CD52 RGS10 CXCR4 MIR4461 COL1A1 TYMS MCM5 OAS2 PTPRCAP PMEL SPOCK2 PRAME AURKB MIA SELPLG APOD UXS1 IFITM1 MAGEC2 TYROBP IRF8 CD69 CD8A TNFAIP3 HLA-DQA1 TIGIT CD27 MS4A1 LSP1 TCF7 C10orf54 SYTL3 TOB1 CCL4 TOX PRDM1 CTLA4 GZMB HAPLN3 HLA-DQB1 HLA-DPA1 LTB

4. Validation

4.1 PCA and UMAP validation

The PCA and UMAP largely resemble the ones obtained from all genes.

[26]:

new_adata = model.transform(adata)
sc.tl.pca(new_adata, svd_solver='arpack')
sc.pl.pca(new_adata, color=['malignant', 'celltype', 'tumor'])
sc.pp.neighbors(new_adata, n_pcs=10, use_rep="X_pca")
sc.tl.umap(new_adata)
sc.pl.umap(new_adata, color=['malignant', 'celltype', 'tumor'], frameon=False)

computing PCA
    on highly variable genes
    with n_comps=50

C:\Users\SLiang3\Miniconda3\envs\scanpy37\lib\site-packages\anndata\_core\anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
  if not is_categorical(df_full[k]):

    finished (0:00:00)

computing neighbors
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:06)

4.2 Gene sets analysis

4.2.1 Gene sets from the original publication

[27]:

import pickle as pkl
with open("../../Melanoma/markers.pkl", 'rb') as file:
    markers = pkl.load(file)

selected_markers = adata.var_names[model.get_mask()].tolist()
all_markers = adata.var_names.tolist()

for i in markers:
    print(i, len(set(markers[i]).intersection(set(selected_markers))), '/',
          len(set(markers[i]).intersection(set(all_markers))), "(", len(markers[i]), "):", end=" ")
    print(*set(markers[i]).intersection(set(selected_markers)))

axl 2 / 52 ( 100 ): HAPLN3 CD52
mitf 3 / 28 ( 100 ): TOB1 PMEL TYR
cell-cycle 3 / 54 ( 93 ): TYMS AURKB MCM5
melanoma 5 / 19 ( 47 ): TYR PRAME PMEL SERPINA3 MIA
b 2 / 27 ( 31 ): IRF8 MS4A1
t 7 / 38 ( 38 ): TCF7 PRDM1 CD8A NKG7 SPOCK2 TOX TIGIT
macro 1 / 73 ( 92 ): TYROBP
endo 2 / 45 ( 95 ): HAPLN3 CDH5
caf 3 / 46 ( 88 ): COL1A1 MFAP4 COL1A2
exhaustion-consistent 3 / 17 ( 28 ): CXCL13 CD27 TIGIT
exhaustion_variable 7 / 159 ( 272 ): HLA-DPA1 TOX SRGN CTLA4 HAVCR2 PDCD1 IRF8

4.2.2 Genes not in the gene sets above

[28]:

temp = set(selected_markers.copy())
for i in markers:
    temp -= set(markers[i])

print(*temp)

NME2 MIR4461 RGS1 KIT CCR7 HLA-DQB1 OAS2 GZMA B2M APOD TXNIP IFITM1 C10orf54 DEDD ANXA1 GZMB RGS10 LSP1 TNFAIP3 MTRNR2L2 UXS1 SELPLG TCL1A PAEP C1orf56 ELK2AP GZMH A2M GNLY CD55 VIM SYTL3 PTPRCAP CD69 PTPRC CXCR4 CCL4 RPS11 GPR183 HLA-DQA1 LTB RPS4Y1 MAGEC2

4.2.3 Genes in EMT pathway

[29]:

emt_markers = scmer.Comparison.read_gmt(
    "../../Melanoma/HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION.gmt")['HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION']

scmer.Comparison.compare(y_true=emt_markers, y_pred=selected_markers)

[29]:

'4/200 recalled in the gene set by 75 predicted genes, overlapping genes are COL1A1, VIM, COL1A2, TNFAIP3.'

4.3 Scatter Plot of the Selected Features

[30]:

sc.settings.set_figure_params(dpi=50, facecolor='white')
sc.pl.umap(adata, color=selected_markers, ncols=5, legend_loc="on data", legend_fontsize=8., color_map="inferno")

5. Session Info

[31]:

sc.logging.print_versions()

WARNING: If you miss a compact list, please try `print_header`!
-----
anndata     0.7.4
scanpy      1.6.0
sinfo       0.3.1
-----
PIL                 7.2.0
anndata             0.7.4
backcall            0.2.0
cairo               1.19.1
cffi                1.14.2
colorama            0.4.3
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.1
decorator           4.4.2
get_version         2.1
h5py                2.10.0
igraph              0.8.2
importlib_metadata  1.7.0
ipykernel           5.3.4
ipython_genutils    0.2.0
ipywidgets          7.5.1
jedi                0.17.2
joblib              0.16.0
kiwisolver          1.2.0
legacy_api_wrap     1.2
leidenalg           0.8.1
llvmlite            0.33.0+1.g022ab0f
matplotlib          3.3.1
mkl                 2.3.0
mpl_toolkits        NA
natsort             7.0.1
nt                  NA
ntsecuritycon       NA
numba               0.50.1
numexpr             2.7.1
numpy               1.19.1
packaging           20.4
pandas              1.1.1
parso               0.7.0
pickleshare         0.7.5
pkg_resources       NA
prompt_toolkit      3.0.7
pycparser           2.20
pygments            2.6.1
pyparsing           2.4.7
pythoncom           NA
pytz                2020.1
pywintypes          NA
scanpy              1.6.0
scipy               1.5.2
scmer               NA
seaborn             0.11.0
setuptools_scm      NA
sinfo               0.3.1
six                 1.15.0
sklearn             0.23.2
sphinxcontrib       NA
statsmodels         0.11.1
storemagic          NA
tables              3.6.1
texttable           1.6.2
torch               1.6.0
tornado             6.0.4
tqdm                4.48.2
traitlets           4.3.3
umap                0.4.6
wcwidth             0.2.5
win32api            NA
win32com            NA
win32security       NA
zipp                NA
zmq                 19.0.1
-----
IPython             7.18.1
jupyter_client      6.1.6
jupyter_core        4.6.3
notebook            6.1.1
-----
Python 3.7.7 (default, May  6 2020, 11:45:54) [MSC v.1916 64 bit (AMD64)]
Windows-10-10.0.17763-SP0
12 logical CPU cores, Intel64 Family 6 Model 158 Stepping 10, GenuineIntel
-----
Session information updated at 2020-11-11 19:54

[ ]: