Source code for netcoloc.network_colocalization

# -*- coding: utf-8 -*-

'''Functions for performing network colocalization
'''


import warnings
import logging

# External library imports
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial import distance

# NEW:
import ipycytoscape
import ipywidgets as widgets
from scipy.stats import hypergeom
from scipy.stats import norm
import random as rn
from tqdm import tqdm
import math


from netcoloc.netprop_zscore import *


logger = logging.getLogger(__name__)



[docs]
def calculate_network_overlap(z_scores_1, z_scores_2, z_score_threshold=3,
                              z1_threshold=1.5, z2_threshold=1.5):
    """
    Function to determine which genes overlap. Returns a list of the
    overlapping genes

    :param z_scores_1: Result from :py:func:`~netcoloc.netprop_zscore.netprop_zscore`
                       or :py:func:`~netcoloc.netprop_zscore.calculate_heat_zscores`
                       containing the z-scores of each gene following network
                       propagation. The index consists of gene names
    :type z_scores_1: :py:class:`pandas.Series`, :py:class:`pandas.DataFrame`, :py:class:`numpy.ndarray`
    :param z_scores_2: Similar to **z_scores_1**. This and **z_scores_1**
                       must contain the same genes (ie. come from the same
                       interactome network)
    :type z_scores_2: :py:class:`pandas.Series`, :py:class:`pandas.DataFrame`, :py:class:`numpy.ndarray`
    :param z_score_threshold: threshold to determine whether a gene is
        a part of the network overlap or not. Genes with combined z-scores
        below this threshold will be discarded
    :type z_score_threshold: float
    :param z1_threshold: individual z1-score threshold to determine whether a gene is
        a part of the network overlap or not. Genes with z1-scores
        below this threshold will be discarded
    :type z1_threshold: float
    :param z2_threshold: individual z2-score threshold to determine whether a gene is
        a part of the network overlap or not. Genes with z2-scores
        below this threshold will be discarded
    :type z2_threshold: float
    :return: genes in the network overlap (genes with high combined
            z-scores)
    :rtype: list
    """
    # add a warning if z_score_threshold < z1_threshold or z2_threshold
    if z_score_threshold <= z1_threshold*z2_threshold:
        warnings.warn(f'z_score_threshold ({z_score_threshold}) is less than or equal to z1_threshold ({z1_threshold}) * z2_threshold ({z2_threshold}). '
                      f'The combined threshold will have no effect.')
    if z1_threshold != z2_threshold:
        warnings.warn(f'z1_threshold ({z1_threshold}) is not equal to z2_threshold ({z2_threshold}). '
                      f'Using the same threshold for both individual scores is recommended.')
    if isinstance(z_scores_1, pd.Series):
        z_scores_1 = z_scores_1.to_frame(name='z_scores_1')
        z_scores_2 = z_scores_2.to_frame(name='z_scores_2')
    elif isinstance(z_scores_1, np.ndarray):
        z_scores_1 = pd.DataFrame({"z_scores_1":z_scores_1})
        z_scores_2 = pd.DataFrame({"z_scores_2":z_scores_2})
    else:
        z_scores_1.columns = ["z_scores_1"]
        z_scores_2.columns = ["z_scores_2"]
        
    assert len(z_scores_1) == len(z_scores_2), f'z_scores_1 and z_scores_2 must have the same length. Got Z1 :{len(z_scores_1)} and Z2:{len(z_scores_2)}'
    z_scores_joined = z_scores_1.join(z_scores_2)
    z_scores_combined = (z_scores_joined['z_scores_1']
                        * z_scores_joined['z_scores_2']
                        * (z_scores_joined['z_scores_1'] > 0)
                        * (z_scores_joined['z_scores_2'] > 0))
    # get rid of unlikely genes which have low scores in either z1 or z2
    high_z_score_genes = z_scores_combined[
        (z_scores_combined >= z_score_threshold)
         & (z_scores_joined['z_scores_1'] > z1_threshold)
         & (z_scores_joined['z_scores_2'] > z2_threshold)
    ].index.tolist()

    return high_z_score_genes




[docs]
def calculate_network_overlap_subgraph(interactome, z_scores_1,
                                       z_scores_2, z_score_threshold=3,
                                       z1_threshold=1.5,z2_threshold=1.5):
    """
    Function to return subgraph of network intersection.

    Code to create subgraph is from
    `NetworkX subgraph documentation <https://networkx.org/documentation/stable/reference/classes/generated/networkx.Graph.subgraph.html>`__

    :param interactome: network whose subgraph will be returned
    :type interactome: :py:class:`networkx.Graph`
    :param z_scores_1: Result from :py:func:`~netcoloc.netprop_zscore.netprop_zscore`
                       or :py:func:`~netcoloc.netprop_zscore.calculate_heat_zscores`
                       containing the z-scores of each gene following network
                       propagation. The index consists of gene names
    :type z_scores_1: :py:class:`pandas.Series`
    :param z_scores_2: Similar to **z_scores_1**. This and **z_scores_1**
                       must contain the same genes (ie. come celfrom the same
                       interactome network)
    :type z_scores_2: :py:class:`pandas.Series`
    :param z_score_threshold: threshold to determine whether a gene is
        a part of the network overlap or not. Genes with combined z-scores
        below this threshold will be discarded
    :type z_score_threshold: float
    :param z1_threshold: individual z1-score threshold to determine whether a gene is
        a part of the network overlap or not. Genes with z1-scores
        below this threshold will be discarded
    :type z1_threshold: float
    :param z2_threshold: individual z2-score threshold to determine whether a gene is
        a part of the network overlap or not. Genes with z2-scores
        below this threshold will be discarded
    :type z2_threshold: float
    :return: Subgraph of the interactome containing only genes that
            are in the network intersection (genes with high combined z-scores)
    :rtype: :py:class:`networkx.Graph`
    """
    network_overlap = calculate_network_overlap(z_scores_1, z_scores_2, z_score_threshold=z_score_threshold,
                                               z1_threshold=z1_threshold,z2_threshold=z2_threshold)
    if len(network_overlap) > 0:
        assert all(node in interactome.nodes() for node in network_overlap), f'Not all nodes in the network overlap are present in the interactome. Missing nodes: {set(network_overlap) - set(interactome.nodes())}'
    # Create subgraph that has the same type as original graph
    network_overlap_subgraph = interactome.__class__()
    network_overlap_subgraph.add_nodes_from((node, interactome.nodes[node]) for node in network_overlap)
    if network_overlap_subgraph.is_multigraph():
        network_overlap_subgraph.add_edges_from((node, neighbor, key, dictionary)
            for node, neighbors in interactome.adj.items() if node in network_overlap
            for neighbor, item in neighbors.items() if neighbor in network_overlap
            for key, dictionary in item.items())
    else:
        network_overlap_subgraph.add_edges_from((node, neighbor, dictionary)
            for node, neighbors in interactome.adj.items() if node in network_overlap
            for neighbor, dictionary in neighbors.items() if neighbor in network_overlap)
    network_overlap_subgraph.graph.update(interactome.graph)

    return network_overlap_subgraph




[docs]
def calculate_expected_overlap(z_scores_1, z_scores_2, seed1=None, seed2=None,
                            z_score_threshold=3, z1_threshold=1.5,
                            z2_threshold=1.5, overlap_control=None,
                            num_reps=1000, plot=False):
    """
    Determines size of expected network overlap by randomly
    shuffling gene names

    :param z_scores_1: Result from :py:func:`~netcoloc.netprop_zscore.netprop_zscore`
                    or :py:func:`~netcoloc.netprop_zscore.calculate_heat_zscores`
                    containing the z-scores of each gene following network
                    propagation. The index consists of gene names
    :type z_scores_1: :py:class:`pandas.Series`
    :param z_scores_2: Similar to **z_scores_1**. This and **z_scores_1**
                    must contain the same genes (ie. come from the same
                    interactome network)
    :type z_scores_2: :py:class:`pandas.Series`
    :param z_score_threshold: threshold to determine whether a gene is
        a part of the network overlap or not. Genes with combined z-scores
        below this threshold will be discarded
    :type z_score_threshold: float
    :param z1_threshold: individual z1-score threshold to determine whether a gene is
        a part of the network overlap or not. Genes with z1-scores
        below this threshold will be discarded
    :type z1_threshold: float
    :param z2_threshold: individual z2-score threshold to determine whether a gene is
        a part of the network overlap or not. Genes with z2-scores
        below this threshold will be discarded
    :type z2_threshold: float
    :param num_reps: Number of repitions of randomly shuffling input z_score vectors to generate null distribution
    :type num_reps: int
    :param plot: If ``True``, distribution will be plotted
    :type plot: bool
    :return: Observed overlap size, and vector of randomized overlap sizes from permuted z_scores
    :rtype: float, np.array of floats
    """
    # Build a distribution of expected network overlap sizes by shuffling node names
    random_network_overlap_sizes = []
    if isinstance(z_scores_1, pd.Series):
        z_scores_1 = pd.DataFrame(z_scores_1, columns=["z"])           
    if isinstance(z_scores_2, pd.Series):
        z_scores_2 = pd.DataFrame(z_scores_2, columns=["z"])
    z_scores_1_copy = z_scores_1.copy()
    z_scores_2_copy = z_scores_2.copy()
    z1z2 = z_scores_1.join(z_scores_2, lsuffix="1", rsuffix="2")
    z1z2 = z1z2.assign(zz=z1z2.z1 * z1z2.z2)
    
    if overlap_control is not None:
        if overlap_control == "remove":
            seed_overlap = list(set(seed1).intersection(set(seed2)))
            print("Overlap seed genes:", len(seed_overlap))
            z1z2.drop(seed_overlap, axis=0, inplace=True)
        if overlap_control == "bin":
            seed_overlap = list(set(seed1).intersection(set(seed2)))
            print("Overlap seed genes:", len(seed_overlap))
            overlap_z1z2 = z1z2.loc[seed_overlap]
            overlap_z1 = np.array(overlap_z1z2.z1)
            overlap_z2 = np.array(overlap_z1z2.z2)
            z1z2.drop(seed_overlap, axis=0, inplace=True)
                
    
    z1 = np.array(z1z2.z1)
    z2 = np.array(z1z2.z2)
    network_overlap_size = len(calculate_network_overlap(z1, z2,
                                                    z_score_threshold=z_score_threshold,
                                                    z1_threshold=z1_threshold,
                                                    z2_threshold=z2_threshold))
    
    if overlap_control == "bin":  # if overlapping seed genes were separated add their contribution
        network_overlap_size += len(calculate_network_overlap(overlap_z1z2.z1, overlap_z1z2.z2,
                                                        z_score_threshold=z_score_threshold,
                                                        z1_threshold=z1_threshold,
                                                        z2_threshold=z2_threshold))
    
    
    
    random_network_overlap_sizes = np.zeros(num_reps)
    for i in tqdm(range(num_reps)):
        perm_z1z2 = np.zeros(len(z1))
        rn.shuffle(z1)
        perm_size = len(calculate_network_overlap(z1, z2,
                                                        z_score_threshold=z_score_threshold,
                                                        z1_threshold=z1_threshold,
                                                        z2_threshold=z2_threshold))
        if overlap_control == "bin":  # perform the separate permutation
            overlap_perm_z1z2 = np.zeros(len(overlap_z1))
            rn.shuffle(overlap_z1) 
            perm_size_overlap = len(calculate_network_overlap(overlap_z1, overlap_z2,
                                                        z_score_threshold=z_score_threshold,
                                                        z1_threshold=z1_threshold,
                                                        z2_threshold=z2_threshold))
            perm_size += perm_size_overlap  # add to the size
        random_network_overlap_sizes[i] = perm_size


    if plot:
        plt.figure(figsize=(5, 4))
        dfig = sns.histplot(random_network_overlap_sizes,
                            label='Expected network intersection size')
        plt.vlines(network_overlap_size, ymin=0, ymax=dfig.dataLim.bounds[3], color='r',
                label='Observed network intersection size')
        plt.xlabel('Size of proximal subgraph, z > ' + str(z_score_threshold),
                fontsize=16)
        plt.legend(fontsize=12)

    return network_overlap_size, random_network_overlap_sizes




[docs]
def calculate_mean_z_score_distribution(z1, z2, num_reps=1000, zero_double_negatives=True, 
                                        overlap_control="remove", seed1=[], seed2=[]):
    """Determines size of expected mean combined `z=z1*z2` by randomly shuffling gene names

    Args:
        z1 (pd.Series, pd.DataFrame): Vector of z-scores from network propagation of trait 1
        z2 (pd.Series, pd.DataFrame): Vector of z-scores from network propagation of trait 2
        num_reps (int): Number of perumation analyses to perform. Defaults to 1000
        zero_double_negatives (bool, optional): Should genes that have a negative score in both `z1` and `z2` be ignored? Defaults to True.
        overlap_control (str, optional): 'bin' to permute overlapping seed genes separately, 'remove' to not consider overlapping seed genes. Any other value will do nothing. Defaults to "remove".
        seed1 (list, optional): List of seed genes used to generate `z1`. Required if `overlap_control!=None`. Defaults to [].
        seed2 (list, optional): List of seed genes used to generate `z2`. Required if `overlap_control!=None`. Defaults to [].

    Returns:
        float: The observed mean combined z-score from network colocalization
        list: List of permuted mean combined z-scores
    """
    if isinstance(z1, pd.Series):
        z1 = pd.DataFrame(z1, columns=["z"])
    if isinstance(z2, pd.Series):
        z2 = pd.DataFrame(z2, columns=["z"])
    z1z2 = z1.join(z2, lsuffix="1", rsuffix="2")
    z1z2 = z1z2.assign(zz=z1z2.z1 * z1z2.z2)
    
    if zero_double_negatives:
        for node in z1z2.index:
            if (z1z2.loc[node].z1 < 0 and z1z2.loc[node].z2 < 0):
                z1z2.loc[node, 'zz'] = 0
    #print(z1z2.head())
    if overlap_control == "remove":
        seed_overlap = list(set(seed1).intersection(set(seed2)))
        print("Overlap seed genes:", len(seed_overlap))
        z1z2.drop(seed_overlap, axis=0, inplace=True)
        observed_mean = np.mean(z1z2.zz)
    elif overlap_control == "bin":
        seed_overlap = list(set(seed1).intersection(set(seed2)))
        print("Overlap seed genes:", len(seed_overlap))
        observed_mean = z1z2.zz.mean()
        overlap_z1z2 = z1z2.loc[seed_overlap]
        overlap_z1 = np.array(overlap_z1z2.z1)
        z1z2.drop(seed_overlap, axis=0, inplace=True)
    else:
        observed_mean = np.mean(z1z2.zz)

    z1 = np.array(z1z2.z1)
    z2 = np.array(z1z2.z2)

    permutation_means = np.zeros(num_reps)
    for i in tqdm(range(num_reps)):
        perm_z1z2 = np.zeros(len(z1))
        np.random.shuffle(z1)

        for node in range(len(z1)):
            if not zero_double_negatives or not (z1[node] < 0 and z2[node] < 0):
                perm_z1z2[node] = z1[node] * z2[node]
            else:
                perm_z1z2[node] = 0
        if overlap_control == "bin":
            overlap_perm_z1z2 = np.zeros(len(overlap_z1))
            np.random.shuffle(overlap_z1) 
            for node in range(len(overlap_z1)):
                if zero_double_negatives and (overlap_z1[node] < 0 and z2[node] < 0):
                    overlap_perm_z1z2[node] = 0
                else:
                    overlap_perm_z1z2[node] = overlap_z1[node] * z2[node]
            perm_z1z2 = np.concatenate([perm_z1z2, overlap_perm_z1z2])
                    
        permutation_means[i] = np.mean(perm_z1z2)
    return observed_mean, permutation_means





[docs]
def get_p_from_permutation_results(observed, permuted, alternative='greater'):
    """Calculates the significance of the observed mean relative to the empirical normal distribution of permuted means using a one-sided test or two sided test.

    Args:
        observed (float): The observed value to be tested
        permuted (list): List of values that make up the expected distribution
        alternative (str, optional): The alternative hypothesis to test against. Can be 'greater', 'less', or 'two-sided'. Defaults to 'greater'.
    
    Returns:
        float: p-value from z-test of observed value versus the permuted distribution
    """
    assert alternative in ['greater', 'less', 'two-sided'], "Alternative hypothesis must be one of 'greater', 'less', or 'two-sided'."
    if alternative == 'greater':
        # One-sided test: p-value is the probability of observing a value greater than or equal to the observed value
        p = norm.sf((observed - np.mean(permuted)) / np.std(permuted))
    elif alternative == 'less':
        # One-sided test: p-value is the probability of observing a value less than or equal to the observed value
        p = norm.cdf((observed - np.mean(permuted)) / np.std(permuted))
    elif alternative == 'two-sided':
        # Two-sided test: p-value is the probability of observing a value as extreme as the observed value in either direction
        p = 2 * norm.sf(abs((observed - np.mean(permuted)) / np.std(permuted)))
    try:    #round to four significant figures
        p = round(p, 4 - int(math.floor(math.log10(abs(p)))) - 1)
    except ValueError:
        if np.isnan(p):
            warnings.warn('p-value is NaN')
    return p



[docs]
def transform_edges(G, method='cosine_sim', edge_weight_threshold=0.95):
    """
    Transforms binary edges using selected method (currently only cosine similarity is implemented).
    Cosine similarity measures the similarity between neighbors of node pairs in the input network

    :param G: network whose edges will be transformed
    :type G: :py:class:`networkx.Graph`
    :param method: Method to use, only ``cosine_sim`` supported. Any other value will
                   cause this method to output a warning and immediately return
    :type method: str
    :param edge_weight_threshold: Transformed edges will be returned which have values greater than this
    :type edge_weight_threshold: float
    :return: Graph with nodes identical to input G, but with transformed edges (values > edge_weight_threshold)
    :rtype: :py:class:`networkx.Graph`
    """

    if method not in ['cosine_sim']:  # update this if we add more methods
        warnings.warn('Error: ' + method + ' method not yet implemented')
        return

    # compute the adjacency matrix
    logging.info('computing the adjacency matrix...')

    nodelist = list(G.nodes())
    # get graph as adjacency matrix
    graph_as_adj = nx.to_numpy_array(G)

    # add transpose to matrix to remove edge direction
    adj_temp = graph_as_adj + np.transpose(graph_as_adj)

    # compute the cosine similarity
    logging.info('computing the cosine similarity...')
    cos_pc = pd.DataFrame(np.zeros((adj_temp.shape[0],
                                   adj_temp.shape[1])),
                          index=nodelist)
    cos_pc.columns = nodelist

    for i in np.arange(0, len(nodelist)-1):
        n1 = nodelist[i]
        # this node has no neighbors so set
        # cosine distance to maximum distance aka 1.0
        # in cos_pc and continue
        if max(adj_temp[i]) < 1.0:
            for j in np.arange(i + 1, len(nodelist)):
                n2 = nodelist[j]
                cos_pc.loc[n1, n2] = 1.0
                cos_pc.loc[n2, n1] = 1.0
            continue
        for j in np.arange(i+1, len(nodelist)):
            n2 = nodelist[j]

            # make sure they have some neighbors
            if max(adj_temp[j]) > 0.0:
                cosine_distance = distance.cosine(adj_temp[i],
                                                  adj_temp[j])
            else:
                # no neighbors in neigh2 so set
                # cosine distance to maximum distance aka 1.0
                cosine_distance = 1.0

            cos_pc.loc[n1, n2] = cosine_distance
            cos_pc.loc[n2, n1] = cosine_distance

    # Rank transform 1-cos distance
    logger.info('rank transforming...')
    m1cos = 1-cos_pc
    m1cos = m1cos.replace(np.nan, 0)
    sim_names = m1cos.index.tolist()
    sim_rank = m1cos.rank(0) / (m1cos.shape[0] - 1)
    sim_rank = np.array(sim_rank)
    np.fill_diagonal(sim_rank,0) # remove self edges
    sim_rank = pd.DataFrame(sim_rank)
    sim_rank = pd.DataFrame((sim_rank.values + sim_rank.values.T) / 2.0,
                            columns=sim_names, index=sim_names)

    # remove self edges
    #sim_rank.values[[np.arange(sim_rank.shape[0])]*2] = 0


    sim_rank['gene_temp'] = sim_rank.index.tolist()
    sim_rank_EL = sim_rank.melt(id_vars=['gene_temp'])
    sim_rank_EL.columns = ['node1', 'node2', 'sim']
    sim_rank_EL = sim_rank_EL[sim_rank_EL['sim'] > edge_weight_threshold]
    G_transf = nx.Graph()
    G_transf.add_nodes_from(G)
    G_transf.add_weighted_edges_from(zip(sim_rank_EL['node1'],
                                         sim_rank_EL['node2'],
                                         sim_rank_EL['sim']))

    logger.info('number of transformed edges returned = ' +
                str(len(G_transf.edges())))

    return G_transf



[docs]
def view_G_hier(G_hier,layout='cose'):
    
    """
    In-notebook visualization of NetColoc hierarchy, using ipycytoscape.

    :param G_hier: network to visualize. Expects output of  cdapsutil.CommunityDetection(), transformed to networkx format. 'CD_MemberList_LogSize' is a required field of the network to map to the node size.
    :type G: :py:class:`networkx.Graph`
    :param layout: Layout method to use, any layout supported by cytoscape.js is supported. Suggest 'cose' or 'breadthfirst'.
    :type layout: str
    :param edge_weight_threshold: Transformed edges will be returned which have values greater than this
    :type edge_weight_threshold: float
    :return: Nothing
    """
    
    # add a new field to map to node size
    new_vals = nx.get_node_attributes(G_hier,'CD_MemberList_LogSize')
    new_vals_keys = list(new_vals.keys())
    new_vals_vals = [np.float64(v)*10 for v in list(new_vals.values())]
    new_vals = dict(zip(new_vals_keys,[str(v) for v in new_vals_vals]))
    new_vals
    nx.set_node_attributes(G_hier,new_vals,name='CD_MemberList_LogSize_viz')
    
    ipyviewer = ipycytoscape.CytoscapeWidget()
    ipyviewer.graph.add_graph_from_networkx(G_hier)
    ipyviewer.set_style([{
                            'selector': 'node',
                            'css': {
                                'content': 'data(name)',
                                'text-valign': 'center',
                                'color': 'white',
                                'text-outline-width': 2,
                                'text-outline-color': 'green',
                                'background-color': 'green',
                                'width': 'data(CD_MemberList_LogSize_viz)',
                                'height': 'data(CD_MemberList_LogSize_viz)'
                            }
                            }
                            ])

    #ipyviewer.set_layout(name='breadthfirst',directed='true')
    ipyviewer.set_layout(name='cose')
    display(ipyviewer)

    

[docs]
def sweep_input_pvals(D1_df,D2_df,
                      individual_heats_matrix,nodes,degrees,
                      cutoff_list = [.01,.02,.03,.04,.05,.1],
                      gene_column='gene',
                      score_column='pval',
                      cutoff_max=True,
                     num_reps=100,
                     verbose=True,
                     z_score_threshold=3,
                     z12_threshold=1.5):
    
    """
    Evaluate NetColoc enrichment for a range of thresholds on input gene lists.
    
    :param D1_df: DataFrame containing gene names and scores for the first gene set
    :type D1_df: :py:class:`pandas.DataFrame`
    :param D2_df: DataFrame containing gene names and scores for the second gene set
    :type D2_df: :py:class:`pandas.DataFrame`
    :param individual_heats_matrix: output of the
            netprop.get_individual_heats_matrix. A square matrix containing the
            final heat contributions of each gene
    :type individual_heats_matrix: :py:class:`numpy.ndarray`
    :param nodes: nodes, in the order in which they were supplied to
            the :py:func:`~netcoloc.netprop.get_normalized_adjacency_matrix` method
            which returns the precursor to the individual_heats_matrix
    :type nodes: list
    :param degrees: Mapping of node names to node degrees
    :type degrees: dict
    :param cutoff_list: list of values to threshold the input gene sets by
    :type cutoff_list: list
    :param gene_column: name of column containing genes in D1_df and D2_df
    :type gene_column: string
    :param score_column: name of column containing scores (usually p-value or log fold change) in D1_df and D2_df
    :type score_column: string
    :param cutoff_max: if True, genes will be selected which have scores less than the cutoff value. If false, genes will be selected which have scores greater than the cutoff value.
    :type cutoff_max: Boolean
    :param num_reps: Number of times the network propagation algorithm should
        be run using random seed genes in order to build the null model
    :type num_reps: int
    :param verbose: if True, print out some diagnostics
    :type verbose: Boolean
    :param z_score_threshold: threshold to determine whether a gene is
        a part of the network overlap or not. Genes with combined z-scores
        below this threshold will be discarded
    :type z_score_threshold: float
    :param z12_threshold: individual z1/z2-score threshold to determine whether a gene is
        a part of the network overlap or not. Genes with z1/z2-scores
        below this threshold will be discarded
    :type z12_threshold: float

    :return: netcoloc_pval_df: DataFrame containing NetColoc enrichment results
    :rtype: :py:class:`pandas.DataFrame`
    """
    
    D1_num_genes, D2_num_genes = [],[]
    num_shared_genes = []
    plist = []
    obs_overlap_list=[]
    network_exp_mean_overlap_list=[]
    network_exp_std_overlap_list=[]
    
    for c in cutoff_list:

        if cutoff_max==True:
            D1_genes = D1_df[D1_df[score_column]<c][gene_column].tolist()
            D2_genes = D2_df[D2_df[score_column]<c][gene_column].tolist()
        else:
            D1_genes = D1_df[D1_df[score_column]>c][gene_column].tolist()
            D2_genes = D2_df[D2_df[score_column]>c][gene_column].tolist()
            
        D1_num_genes.append(len(D1_genes))
        D2_num_genes.append(len(D2_genes))
        num_shared_genes.append(len(np.intersect1d(D1_genes,D2_genes)))
            
        # check that the gene sizes meet the input criteria of >5 and <500
        if (5<=len(D1_genes)<=500) & (5<=len(D2_genes)<=500):
            
            # D1 variant network propagation
            print('\nCalculating D1 variant z-scores: ')
            z_D1, Fnew_D1, Fnew_rand_D1 = calculate_heat_zscores(individual_heats_matrix, nodes, 
                                                                                degrees, 
                                                                                D1_genes, num_reps=num_reps)

            z_D1 = pd.DataFrame({'z':z_D1})
            z_D1.sort_values('z',ascending=False).head()

            # D2 variant network propagation
            print('\nCalculating D2 variant z-scores: ')
            z_D2, Fnew_D2, Fnew_rand_D2 = calculate_heat_zscores(individual_heats_matrix, nodes, 
                                                                                degrees, 
                                                                                D2_genes, num_reps=num_reps)

            z_D2 = pd.DataFrame({'z':z_D2})

            z_d1d2_size, high_z_rand = calculate_expected_overlap(
                z_D1['z'],
                z_D2['z'],
                plot=False,
                num_reps=num_reps,
                z_score_threshold=z_score_threshold,
                z1_threshold=z12_threshold,
                z2_threshold=z12_threshold
            )

            ztemp = (z_d1d2_size - np.mean(high_z_rand)) / np.std(high_z_rand)
            ptemp = norm.sf(ztemp)

            # save the number of overlapping genes and overlap p-value
            network_num_overlap = z_d1d2_size
            network_pval_overlap = ptemp
            
            network_exp_mean_overlap = np.mean(high_z_rand)
            network_exp_std_overlap = np.std(high_z_rand)

            
            obs_exp_temp = float(z_d1d2_size) / np.mean(high_z_rand)
            
        else:
            ptemp = np.nan
            network_num_overlap=np.nan
            network_exp_mean_overlap=np.nan
            network_exp_std_overlap=np.nan
            z_d1d2_size=np.nan
            obs_exp_temp=np.nan
            
        plist.append(ptemp)
        obs_overlap_list.append(network_num_overlap)
        network_exp_mean_overlap_list.append(network_exp_mean_overlap)
        network_exp_std_overlap_list.append(network_exp_std_overlap)
            
        if verbose:
            print('\ncutoff = '+str(c))
            print('number of D1 genes = '+str(len(D1_genes)))
            print('number of D2 genes = '+str(len(D2_genes)))
            print('number of shared genes = '+str(len(np.intersect1d(D1_genes,D2_genes))))
            
            
            print('size of network intersection = ' + str(z_d1d2_size))
            print('observed size/ expected size = ' + str(obs_exp_temp))
            print('p = ' + str(ptemp))
            



    netcoloc_pval_df = pd.DataFrame({'score_cutoff':cutoff_list,
                                           'D1_num_genes':D1_num_genes,
                                           'D2_num_genes':D2_num_genes,
                                            'num_shared_genes':num_shared_genes,
                                          'observed_overlap':obs_overlap_list,
                                          'expected_overlap_mean':network_exp_mean_overlap_list,
                                          'expected_overlap_std':network_exp_std_overlap_list,
                                           'empirical_p':plist
                                          })
    netcoloc_pval_df['obs_exp']=netcoloc_pval_df['observed_overlap']/netcoloc_pval_df['expected_overlap_mean']

            
    
    return netcoloc_pval_df





[docs]
def calculate_network_enrichment(z_D1,z_D2,zthresh_list = list(np.arange(1,15)),z12thresh_list=[1,1.5,2],
                                 verbose=True):

    """
    Evaluate NetColoc enrichment for a range of thresholds on network proximity z-scores.
    
    :param z_D1: DataFrame containing gene names and network proximity z-scores for the first gene set
    :type z_D1: :py:class:`pandas.DataFrame`
    :param z_D2: DataFrame containing gene names and network proximity z-scores for the second gene set
    :type z_D2: :py:class:`pandas.DataFrame`
    :param zthresh_list: list of combined z-score thresholds to iterate over
    :type zthresh_list: :list
    :param z12thresh_list: list of individual z-score thresholds to iterate over
    :type z12thresh_list: :list
    :param verbose: if True, print out some diagnostics
    :type verbose: Boolean
    
    :return: netcoloc_enrichment_df: DataFrame containing NetColoc enrichment results
    :rtype: :py:class:`pandas.DataFrame`
    """
    
    plist = []
    obs_overlap_list=[]
    network_exp_mean_overlap_list=[]
    network_exp_std_overlap_list=[]
    zlist=[]
    z12list=[]

    for zthresh in zthresh_list:
        for z12 in z12thresh_list:

            z_d1d2_size, high_z_rand = calculate_expected_overlap(
                z_D1['z'],
                z_D2['z'],
                plot=False,
                num_reps=100,
                z_score_threshold=zthresh,
                z1_threshold=z12,
                z2_threshold=z12
            )

            ztemp = (z_d1d2_size - np.mean(high_z_rand)) / np.std(high_z_rand)
            ptemp = norm.sf(ztemp)
            plist.append(ptemp)

            # save the number of overlapping genes and overlap p-value
            network_num_overlap = z_d1d2_size
            obs_overlap_list.append(network_num_overlap)
            network_pval_overlap = ptemp
            obs_exp_temp = float(z_d1d2_size) / np.mean(high_z_rand)
            network_obs_exp = obs_exp_temp
            network_exp_mean_overlap = np.mean(high_z_rand)
            network_exp_mean_overlap_list.append(network_exp_mean_overlap)
            network_exp_std_overlap = np.std(high_z_rand)
            network_exp_std_overlap_list.append(network_exp_std_overlap)

            zlist.append(zthresh)
            z12list.append(z12)

            if verbose:
                print('z = '+str(zthresh))
                print('z12 = '+str(z12))
                print('size of network intersection = ' + str(z_d1d2_size))
                print('observed size/ expected size = ' + str(obs_exp_temp))
                print('p = ' + str(ptemp))



    netcoloc_enrichment_df = pd.DataFrame({'z_comb':zlist,
                                           'z_12':z12list,
                                          'observed_overlap':obs_overlap_list,
                                          'expected_overlap_mean':network_exp_mean_overlap_list,
                                          'expected_overlap_std':network_exp_std_overlap_list,
                                           'empirical_p':plist
                                          })
    netcoloc_enrichment_df['obs_exp']=netcoloc_enrichment_df['observed_overlap']/netcoloc_enrichment_df['expected_overlap_mean']
    
    return netcoloc_enrichment_df