# -*- coding: utf-8 -*-
'''Functions for performing network colocalization
'''
import warnings
import logging
# External library imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial import distance
# NEW:
import ipycytoscape
import ipywidgets as widgets
from scipy.stats import hypergeom
from scipy.stats import norm
import random as rn
from tqdm import tqdm
import math
from netcoloc.netprop_zscore import *
logger = logging.getLogger(__name__)
[docs]
def calculate_network_overlap(z_scores_1, z_scores_2, z_score_threshold=3,
z1_threshold=1.5, z2_threshold=1.5):
"""
Function to determine which genes overlap. Returns a list of the
overlapping genes
:param z_scores_1: Result from :py:func:`~netcoloc.netprop_zscore.netprop_zscore`
or :py:func:`~netcoloc.netprop_zscore.calculate_heat_zscores`
containing the z-scores of each gene following network
propagation. The index consists of gene names
:type z_scores_1: :py:class:`pandas.Series`, :py:class:`pandas.DataFrame`, :py:class:`numpy.ndarray`
:param z_scores_2: Similar to **z_scores_1**. This and **z_scores_1**
must contain the same genes (ie. come from the same
interactome network)
:type z_scores_2: :py:class:`pandas.Series`, :py:class:`pandas.DataFrame`, :py:class:`numpy.ndarray`
:param z_score_threshold: threshold to determine whether a gene is
a part of the network overlap or not. Genes with combined z-scores
below this threshold will be discarded
:type z_score_threshold: float
:param z1_threshold: individual z1-score threshold to determine whether a gene is
a part of the network overlap or not. Genes with z1-scores
below this threshold will be discarded
:type z1_threshold: float
:param z2_threshold: individual z2-score threshold to determine whether a gene is
a part of the network overlap or not. Genes with z2-scores
below this threshold will be discarded
:type z2_threshold: float
:return: genes in the network overlap (genes with high combined
z-scores)
:rtype: list
"""
# add a warning if z_score_threshold < z1_threshold or z2_threshold
if z_score_threshold <= z1_threshold*z2_threshold:
warnings.warn(f'z_score_threshold ({z_score_threshold}) is less than or equal to z1_threshold ({z1_threshold}) * z2_threshold ({z2_threshold}). '
f'The combined threshold will have no effect.')
if z1_threshold != z2_threshold:
warnings.warn(f'z1_threshold ({z1_threshold}) is not equal to z2_threshold ({z2_threshold}). '
f'Using the same threshold for both individual scores is recommended.')
if isinstance(z_scores_1, pd.Series):
z_scores_1 = z_scores_1.to_frame(name='z_scores_1')
z_scores_2 = z_scores_2.to_frame(name='z_scores_2')
elif isinstance(z_scores_1, np.ndarray):
z_scores_1 = pd.DataFrame({"z_scores_1":z_scores_1})
z_scores_2 = pd.DataFrame({"z_scores_2":z_scores_2})
else:
z_scores_1.columns = ["z_scores_1"]
z_scores_2.columns = ["z_scores_2"]
assert len(z_scores_1) == len(z_scores_2), f'z_scores_1 and z_scores_2 must have the same length. Got Z1 :{len(z_scores_1)} and Z2:{len(z_scores_2)}'
z_scores_joined = z_scores_1.join(z_scores_2)
z_scores_combined = (z_scores_joined['z_scores_1']
* z_scores_joined['z_scores_2']
* (z_scores_joined['z_scores_1'] > 0)
* (z_scores_joined['z_scores_2'] > 0))
# get rid of unlikely genes which have low scores in either z1 or z2
high_z_score_genes = z_scores_combined[
(z_scores_combined >= z_score_threshold)
& (z_scores_joined['z_scores_1'] > z1_threshold)
& (z_scores_joined['z_scores_2'] > z2_threshold)
].index.tolist()
return high_z_score_genes
[docs]
def calculate_network_overlap_subgraph(interactome, z_scores_1,
z_scores_2, z_score_threshold=3,
z1_threshold=1.5,z2_threshold=1.5):
"""
Function to return subgraph of network intersection.
Code to create subgraph is from
`NetworkX subgraph documentation <https://networkx.org/documentation/stable/reference/classes/generated/networkx.Graph.subgraph.html>`__
:param interactome: network whose subgraph will be returned
:type interactome: :py:class:`networkx.Graph`
:param z_scores_1: Result from :py:func:`~netcoloc.netprop_zscore.netprop_zscore`
or :py:func:`~netcoloc.netprop_zscore.calculate_heat_zscores`
containing the z-scores of each gene following network
propagation. The index consists of gene names
:type z_scores_1: :py:class:`pandas.Series`
:param z_scores_2: Similar to **z_scores_1**. This and **z_scores_1**
must contain the same genes (ie. come celfrom the same
interactome network)
:type z_scores_2: :py:class:`pandas.Series`
:param z_score_threshold: threshold to determine whether a gene is
a part of the network overlap or not. Genes with combined z-scores
below this threshold will be discarded
:type z_score_threshold: float
:param z1_threshold: individual z1-score threshold to determine whether a gene is
a part of the network overlap or not. Genes with z1-scores
below this threshold will be discarded
:type z1_threshold: float
:param z2_threshold: individual z2-score threshold to determine whether a gene is
a part of the network overlap or not. Genes with z2-scores
below this threshold will be discarded
:type z2_threshold: float
:return: Subgraph of the interactome containing only genes that
are in the network intersection (genes with high combined z-scores)
:rtype: :py:class:`networkx.Graph`
"""
network_overlap = calculate_network_overlap(z_scores_1, z_scores_2, z_score_threshold=z_score_threshold,
z1_threshold=z1_threshold,z2_threshold=z2_threshold)
if len(network_overlap) > 0:
assert all(node in interactome.nodes() for node in network_overlap), f'Not all nodes in the network overlap are present in the interactome. Missing nodes: {set(network_overlap) - set(interactome.nodes())}'
# Create subgraph that has the same type as original graph
network_overlap_subgraph = interactome.__class__()
network_overlap_subgraph.add_nodes_from((node, interactome.nodes[node]) for node in network_overlap)
if network_overlap_subgraph.is_multigraph():
network_overlap_subgraph.add_edges_from((node, neighbor, key, dictionary)
for node, neighbors in interactome.adj.items() if node in network_overlap
for neighbor, item in neighbors.items() if neighbor in network_overlap
for key, dictionary in item.items())
else:
network_overlap_subgraph.add_edges_from((node, neighbor, dictionary)
for node, neighbors in interactome.adj.items() if node in network_overlap
for neighbor, dictionary in neighbors.items() if neighbor in network_overlap)
network_overlap_subgraph.graph.update(interactome.graph)
return network_overlap_subgraph
[docs]
def calculate_expected_overlap(z_scores_1, z_scores_2, seed1=None, seed2=None,
z_score_threshold=3, z1_threshold=1.5,
z2_threshold=1.5, overlap_control=None,
num_reps=1000, plot=False):
"""
Determines size of expected network overlap by randomly
shuffling gene names
:param z_scores_1: Result from :py:func:`~netcoloc.netprop_zscore.netprop_zscore`
or :py:func:`~netcoloc.netprop_zscore.calculate_heat_zscores`
containing the z-scores of each gene following network
propagation. The index consists of gene names
:type z_scores_1: :py:class:`pandas.Series`
:param z_scores_2: Similar to **z_scores_1**. This and **z_scores_1**
must contain the same genes (ie. come from the same
interactome network)
:type z_scores_2: :py:class:`pandas.Series`
:param z_score_threshold: threshold to determine whether a gene is
a part of the network overlap or not. Genes with combined z-scores
below this threshold will be discarded
:type z_score_threshold: float
:param z1_threshold: individual z1-score threshold to determine whether a gene is
a part of the network overlap or not. Genes with z1-scores
below this threshold will be discarded
:type z1_threshold: float
:param z2_threshold: individual z2-score threshold to determine whether a gene is
a part of the network overlap or not. Genes with z2-scores
below this threshold will be discarded
:type z2_threshold: float
:param num_reps: Number of repitions of randomly shuffling input z_score vectors to generate null distribution
:type num_reps: int
:param plot: If ``True``, distribution will be plotted
:type plot: bool
:return: Observed overlap size, and vector of randomized overlap sizes from permuted z_scores
:rtype: float, np.array of floats
"""
# Build a distribution of expected network overlap sizes by shuffling node names
random_network_overlap_sizes = []
if isinstance(z_scores_1, pd.Series):
z_scores_1 = pd.DataFrame(z_scores_1, columns=["z"])
if isinstance(z_scores_2, pd.Series):
z_scores_2 = pd.DataFrame(z_scores_2, columns=["z"])
z_scores_1_copy = z_scores_1.copy()
z_scores_2_copy = z_scores_2.copy()
z1z2 = z_scores_1.join(z_scores_2, lsuffix="1", rsuffix="2")
z1z2 = z1z2.assign(zz=z1z2.z1 * z1z2.z2)
if overlap_control is not None:
if overlap_control == "remove":
seed_overlap = list(set(seed1).intersection(set(seed2)))
print("Overlap seed genes:", len(seed_overlap))
z1z2.drop(seed_overlap, axis=0, inplace=True)
if overlap_control == "bin":
seed_overlap = list(set(seed1).intersection(set(seed2)))
print("Overlap seed genes:", len(seed_overlap))
overlap_z1z2 = z1z2.loc[seed_overlap]
overlap_z1 = np.array(overlap_z1z2.z1)
overlap_z2 = np.array(overlap_z1z2.z2)
z1z2.drop(seed_overlap, axis=0, inplace=True)
z1 = np.array(z1z2.z1)
z2 = np.array(z1z2.z2)
network_overlap_size = len(calculate_network_overlap(z1, z2,
z_score_threshold=z_score_threshold,
z1_threshold=z1_threshold,
z2_threshold=z2_threshold))
if overlap_control == "bin": # if overlapping seed genes were separated add their contribution
network_overlap_size += len(calculate_network_overlap(overlap_z1z2.z1, overlap_z1z2.z2,
z_score_threshold=z_score_threshold,
z1_threshold=z1_threshold,
z2_threshold=z2_threshold))
random_network_overlap_sizes = np.zeros(num_reps)
for i in tqdm(range(num_reps)):
perm_z1z2 = np.zeros(len(z1))
rn.shuffle(z1)
perm_size = len(calculate_network_overlap(z1, z2,
z_score_threshold=z_score_threshold,
z1_threshold=z1_threshold,
z2_threshold=z2_threshold))
if overlap_control == "bin": # perform the separate permutation
overlap_perm_z1z2 = np.zeros(len(overlap_z1))
rn.shuffle(overlap_z1)
perm_size_overlap = len(calculate_network_overlap(overlap_z1, overlap_z2,
z_score_threshold=z_score_threshold,
z1_threshold=z1_threshold,
z2_threshold=z2_threshold))
perm_size += perm_size_overlap # add to the size
random_network_overlap_sizes[i] = perm_size
if plot:
plt.figure(figsize=(5, 4))
dfig = sns.histplot(random_network_overlap_sizes,
label='Expected network intersection size')
plt.vlines(network_overlap_size, ymin=0, ymax=dfig.dataLim.bounds[3], color='r',
label='Observed network intersection size')
plt.xlabel('Size of proximal subgraph, z > ' + str(z_score_threshold),
fontsize=16)
plt.legend(fontsize=12)
return network_overlap_size, random_network_overlap_sizes
[docs]
def calculate_mean_z_score_distribution(z1, z2, num_reps=1000, zero_double_negatives=True,
overlap_control="remove", seed1=[], seed2=[]):
"""Determines size of expected mean combined `z=z1*z2` by randomly shuffling gene names
Args:
z1 (pd.Series, pd.DataFrame): Vector of z-scores from network propagation of trait 1
z2 (pd.Series, pd.DataFrame): Vector of z-scores from network propagation of trait 2
num_reps (int): Number of perumation analyses to perform. Defaults to 1000
zero_double_negatives (bool, optional): Should genes that have a negative score in both `z1` and `z2` be ignored? Defaults to True.
overlap_control (str, optional): 'bin' to permute overlapping seed genes separately, 'remove' to not consider overlapping seed genes. Any other value will do nothing. Defaults to "remove".
seed1 (list, optional): List of seed genes used to generate `z1`. Required if `overlap_control!=None`. Defaults to [].
seed2 (list, optional): List of seed genes used to generate `z2`. Required if `overlap_control!=None`. Defaults to [].
Returns:
float: The observed mean combined z-score from network colocalization
list: List of permuted mean combined z-scores
"""
if isinstance(z1, pd.Series):
z1 = pd.DataFrame(z1, columns=["z"])
if isinstance(z2, pd.Series):
z2 = pd.DataFrame(z2, columns=["z"])
z1z2 = z1.join(z2, lsuffix="1", rsuffix="2")
z1z2 = z1z2.assign(zz=z1z2.z1 * z1z2.z2)
if zero_double_negatives:
for node in z1z2.index:
if (z1z2.loc[node].z1 < 0 and z1z2.loc[node].z2 < 0):
z1z2.loc[node, 'zz'] = 0
#print(z1z2.head())
if overlap_control == "remove":
seed_overlap = list(set(seed1).intersection(set(seed2)))
print("Overlap seed genes:", len(seed_overlap))
z1z2.drop(seed_overlap, axis=0, inplace=True)
observed_mean = np.mean(z1z2.zz)
elif overlap_control == "bin":
seed_overlap = list(set(seed1).intersection(set(seed2)))
print("Overlap seed genes:", len(seed_overlap))
observed_mean = z1z2.zz.mean()
overlap_z1z2 = z1z2.loc[seed_overlap]
overlap_z1 = np.array(overlap_z1z2.z1)
z1z2.drop(seed_overlap, axis=0, inplace=True)
else:
observed_mean = np.mean(z1z2.zz)
z1 = np.array(z1z2.z1)
z2 = np.array(z1z2.z2)
permutation_means = np.zeros(num_reps)
for i in tqdm(range(num_reps)):
perm_z1z2 = np.zeros(len(z1))
np.random.shuffle(z1)
for node in range(len(z1)):
if not zero_double_negatives or not (z1[node] < 0 and z2[node] < 0):
perm_z1z2[node] = z1[node] * z2[node]
else:
perm_z1z2[node] = 0
if overlap_control == "bin":
overlap_perm_z1z2 = np.zeros(len(overlap_z1))
np.random.shuffle(overlap_z1)
for node in range(len(overlap_z1)):
if zero_double_negatives and (overlap_z1[node] < 0 and z2[node] < 0):
overlap_perm_z1z2[node] = 0
else:
overlap_perm_z1z2[node] = overlap_z1[node] * z2[node]
perm_z1z2 = np.concatenate([perm_z1z2, overlap_perm_z1z2])
permutation_means[i] = np.mean(perm_z1z2)
return observed_mean, permutation_means
[docs]
def get_p_from_permutation_results(observed, permuted, alternative='greater'):
"""Calculates the significance of the observed mean relative to the empirical normal distribution of permuted means using a one-sided test or two sided test.
Args:
observed (float): The observed value to be tested
permuted (list): List of values that make up the expected distribution
alternative (str, optional): The alternative hypothesis to test against. Can be 'greater', 'less', or 'two-sided'. Defaults to 'greater'.
Returns:
float: p-value from z-test of observed value versus the permuted distribution
"""
assert alternative in ['greater', 'less', 'two-sided'], "Alternative hypothesis must be one of 'greater', 'less', or 'two-sided'."
if alternative == 'greater':
# One-sided test: p-value is the probability of observing a value greater than or equal to the observed value
p = norm.sf((observed - np.mean(permuted)) / np.std(permuted))
elif alternative == 'less':
# One-sided test: p-value is the probability of observing a value less than or equal to the observed value
p = norm.cdf((observed - np.mean(permuted)) / np.std(permuted))
elif alternative == 'two-sided':
# Two-sided test: p-value is the probability of observing a value as extreme as the observed value in either direction
p = 2 * norm.sf(abs((observed - np.mean(permuted)) / np.std(permuted)))
try: #round to four significant figures
p = round(p, 4 - int(math.floor(math.log10(abs(p)))) - 1)
except ValueError:
if np.isnan(p):
warnings.warn('p-value is NaN')
return p
[docs]
def view_G_hier(G_hier,layout='cose'):
"""
In-notebook visualization of NetColoc hierarchy, using ipycytoscape.
:param G_hier: network to visualize. Expects output of cdapsutil.CommunityDetection(), transformed to networkx format. 'CD_MemberList_LogSize' is a required field of the network to map to the node size.
:type G: :py:class:`networkx.Graph`
:param layout: Layout method to use, any layout supported by cytoscape.js is supported. Suggest 'cose' or 'breadthfirst'.
:type layout: str
:param edge_weight_threshold: Transformed edges will be returned which have values greater than this
:type edge_weight_threshold: float
:return: Nothing
"""
# add a new field to map to node size
new_vals = nx.get_node_attributes(G_hier,'CD_MemberList_LogSize')
new_vals_keys = list(new_vals.keys())
new_vals_vals = [np.float64(v)*10 for v in list(new_vals.values())]
new_vals = dict(zip(new_vals_keys,[str(v) for v in new_vals_vals]))
new_vals
nx.set_node_attributes(G_hier,new_vals,name='CD_MemberList_LogSize_viz')
ipyviewer = ipycytoscape.CytoscapeWidget()
ipyviewer.graph.add_graph_from_networkx(G_hier)
ipyviewer.set_style([{
'selector': 'node',
'css': {
'content': 'data(name)',
'text-valign': 'center',
'color': 'white',
'text-outline-width': 2,
'text-outline-color': 'green',
'background-color': 'green',
'width': 'data(CD_MemberList_LogSize_viz)',
'height': 'data(CD_MemberList_LogSize_viz)'
}
}
])
#ipyviewer.set_layout(name='breadthfirst',directed='true')
ipyviewer.set_layout(name='cose')
display(ipyviewer)
[docs]
def calculate_network_enrichment(z_D1,z_D2,zthresh_list = list(np.arange(1,15)),z12thresh_list=[1,1.5,2],
verbose=True):
"""
Evaluate NetColoc enrichment for a range of thresholds on network proximity z-scores.
:param z_D1: DataFrame containing gene names and network proximity z-scores for the first gene set
:type z_D1: :py:class:`pandas.DataFrame`
:param z_D2: DataFrame containing gene names and network proximity z-scores for the second gene set
:type z_D2: :py:class:`pandas.DataFrame`
:param zthresh_list: list of combined z-score thresholds to iterate over
:type zthresh_list: :list
:param z12thresh_list: list of individual z-score thresholds to iterate over
:type z12thresh_list: :list
:param verbose: if True, print out some diagnostics
:type verbose: Boolean
:return: netcoloc_enrichment_df: DataFrame containing NetColoc enrichment results
:rtype: :py:class:`pandas.DataFrame`
"""
plist = []
obs_overlap_list=[]
network_exp_mean_overlap_list=[]
network_exp_std_overlap_list=[]
zlist=[]
z12list=[]
for zthresh in zthresh_list:
for z12 in z12thresh_list:
z_d1d2_size, high_z_rand = calculate_expected_overlap(
z_D1['z'],
z_D2['z'],
plot=False,
num_reps=100,
z_score_threshold=zthresh,
z1_threshold=z12,
z2_threshold=z12
)
ztemp = (z_d1d2_size - np.mean(high_z_rand)) / np.std(high_z_rand)
ptemp = norm.sf(ztemp)
plist.append(ptemp)
# save the number of overlapping genes and overlap p-value
network_num_overlap = z_d1d2_size
obs_overlap_list.append(network_num_overlap)
network_pval_overlap = ptemp
obs_exp_temp = float(z_d1d2_size) / np.mean(high_z_rand)
network_obs_exp = obs_exp_temp
network_exp_mean_overlap = np.mean(high_z_rand)
network_exp_mean_overlap_list.append(network_exp_mean_overlap)
network_exp_std_overlap = np.std(high_z_rand)
network_exp_std_overlap_list.append(network_exp_std_overlap)
zlist.append(zthresh)
z12list.append(z12)
if verbose:
print('z = '+str(zthresh))
print('z12 = '+str(z12))
print('size of network intersection = ' + str(z_d1d2_size))
print('observed size/ expected size = ' + str(obs_exp_temp))
print('p = ' + str(ptemp))
netcoloc_enrichment_df = pd.DataFrame({'z_comb':zlist,
'z_12':z12list,
'observed_overlap':obs_overlap_list,
'expected_overlap_mean':network_exp_mean_overlap_list,
'expected_overlap_std':network_exp_std_overlap_list,
'empirical_p':plist
})
netcoloc_enrichment_df['obs_exp']=netcoloc_enrichment_df['observed_overlap']/netcoloc_enrichment_df['expected_overlap_mean']
return netcoloc_enrichment_df