# Source code for netcoloc.netprop

# -*- coding: utf-8 -*-

'''Functions for performing network propagation
'''

import networkx as nx
import numpy as np
import pandas as pd
import warnings



# [docs]
def get_normalized_adjacency_matrix(graph, conserve_heat=True, weighted=False):
    """
    Returns normalized adjacency matrix (W'), as detailed in:

    Vanunu, Oron, et al. 'Associating genes and protein complexes with disease
    via network propagation.'


    With version `0.1.6` and newer, the :py:class:`networkx.Graph`
    can be directly passed into
    :py:func:`~netcoloc.netprop.get_individual_heats_matrix` and
    this method will be invoked to create the normalized adjacency matrix

    .. note::
        Resulting matrix from this function can be saved to a file with :py:func:`numpy.save`
        and loaded later with :py:func:`numpy.load`, but resulting file can be several gigabytes
        and take a minute or more to save/load.

        .. code-block:: python

            numpy.save('nam.npy', adjacency_matrix)
            adjacency_matrix = numpy.load('nam.npy')


    :param graph: Interactome from which to calculate normalized
            adjacency matrix. A :py:class:`numpy.ndarray` is also accepted
            and is first converted with :py:func:`networkx.from_numpy_array`.
    :type graph: :py:class:`networkx.Graph`
    :param conserve_heat: If ``True``, each edge is normalized by the degree
            of its source node and the resulting matrix is asymmetric.
            Otherwise each edge is normalized by the square root of the
            product of both endpoint degrees and the matrix is symmetric.
    :type conserve_heat: bool
    :param weighted: If ``True``, then the graph's edge weights
            will be taken into account. Otherwise, all edge weights
            will be set to 1. If ``True`` but the graph is not weighted,
            a warning is issued and all weights are set to 1.
    :type weighted: bool
    :raises ValueError: If **graph** is a directed graph or a multigraph, or
            if the normalized graph ends up with fewer nodes than the input
    :raises AssertionError: If any node in **graph** has degree zero
    :return: Square normalized adjacency matrix, with rows/columns ordered
            like ``graph.nodes()``
    :rtype: :py:class:`numpy.ndarray`
    """
    if isinstance(graph, np.ndarray):
        graph = nx.from_numpy_array(graph)

    # MultiDiGraph derives from both DiGraph and MultiGraph, so these two
    # classes cover every unsupported graph type.
    if isinstance(graph, (nx.DiGraph, nx.MultiGraph)):
        raise ValueError("Input graph must be a networkx.Graph object. Directed and MultiGraphs are not supported.")

    # Degrees are looked up once and reused for both validation and
    # normalization.
    # NOTE(review): these are unweighted edge counts even when weighted=True,
    # so rows of a weighted matrix need not sum to 1 - confirm whether the
    # weighted degree was intended.
    node_to_degree_dict = dict(graph.degree)
    assert 0 not in node_to_degree_dict.values(), "Graph cannot have nodes with degree=zero"

    # Decide once whether edge weights are usable; nx.is_weighted scans every
    # edge, so calling it per edge inside the loop would be quadratic.
    use_edge_weights = False
    if weighted:
        use_edge_weights = nx.is_weighted(graph)
        if not use_edge_weights:
            warnings.warn("Input graph is not weighted. All edge weights will be set to 1.")

    # Directed target graph gives an asymmetric matrix (heat conserving);
    # undirected target graph gives a symmetric one.
    graph_weighted = nx.DiGraph() if conserve_heat else nx.Graph()

    # Create normalized edge weights
    edge_weights = []
    for v1, v2, data in graph.edges(data=True):
        deg1 = node_to_degree_dict[v1]
        deg2 = node_to_degree_dict[v2]
        weight = data['weight'] if use_edge_weights else 1

        if conserve_heat:
            # Asymmetric weights: each directed edge u->v is normalized by
            # the degree of its source node u.
            edge_weights.append((v1, v2, weight / float(deg1)))
            edge_weights.append((v2, v1, weight / float(deg2)))
        else:
            # Single undirected edge normalized by the degree of both
            # endpoints as per Vanunu, Oron, et al. 2010
            edge_weights.append((v1, v2, weight / np.sqrt(deg1 * deg2)))

    # Apply edge weights to graph
    graph_weighted.add_weighted_edges_from(edge_weights)

    # Nodes without any edge never make it into graph_weighted; this also
    # guards the case where the assert above is stripped (python -O).
    if len(graph.nodes) != len(graph_weighted):
        raise ValueError("Input graph has nodes with zero degrees. Please remove these nodes.")

    # Transform graph to adjacency array, keeping the input node order
    return nx.to_numpy_array(graph_weighted, nodelist=graph.nodes())




# [docs]
def get_individual_heats_matrix(nam_or_graph, alpha=0.5,
                                conserve_heat=True, weighted=False):
    """
    Pre-computes how much each individual gene in the interactome contributes
    to the final heat of every other gene after propagation.

    The result is ``(1 - alpha) * inv(I - alpha * W'^T)`` where ``W'`` is the
    normalized adjacency matrix.

    .. versionchanged:: 0.1.6
        In addition to a normalized adjacency matrix, this function
        now also supports :py:class:`networkx.Graph` network as input


    If a :py:class:`networkx.Graph` network is passed in as the **nam_or_graph**
    parameter, the function :py:func:`~netcoloc.netprop.get_normalized_adjacency_matrix`
    is called to generate the normalized adjacency matrix using **conserve_heat** and
    **weighted** parameters

    .. note::
        Resulting matrix from this function can be saved to a file with :py:func:`numpy.save`
        and loaded later with :py:func:`numpy.load`, but resulting file can be several gigabytes
        and take a minute or more to save/load.

        .. code-block:: python

            numpy.save('heats_matrix.npy', w_double_prime)
            w_double_prime = numpy.load('heats_matrix.npy')


    :param nam_or_graph: square normalized
            adjacency matrix or network
    :type nam_or_graph: :py:class:`numpy.ndarray` or :py:class:`networkx.Graph`
    :param alpha: heat dissipation coefficient between 0 and 1. The
           contribution of the heat propagated from adjacent nodes in
           determining the final heat of a node, as opposed to the contribution
           from being a part of the gene set initially
    :type alpha: float
    :param conserve_heat: Forwarded to
            :py:func:`~netcoloc.netprop.get_normalized_adjacency_matrix`.
            **NOTE:** Only applies if **nam_or_graph** is :py:class:`networkx.Graph`
    :type conserve_heat: bool
    :param weighted: Forwarded to
            :py:func:`~netcoloc.netprop.get_normalized_adjacency_matrix`.
            **NOTE:** Only applies if **nam_or_graph** is :py:class:`networkx.Graph`
    :type weighted: bool
    :raises AssertionError: If **alpha** is outside the range [0, 1]
    :return: square individual heats matrix
    :rtype: :py:class:`numpy.ndarray`
    """
    assert 0 <= alpha <= 1, "Alpha must be between 0 and 1"

    # A graph is first turned into its normalized adjacency matrix; anything
    # else is taken to already be that matrix.
    if isinstance(nam_or_graph, nx.Graph):
        adjacency = get_normalized_adjacency_matrix(nam_or_graph,
                                                    conserve_heat=conserve_heat,
                                                    weighted=weighted)
    else:
        adjacency = nam_or_graph

    transposed = np.transpose(adjacency)
    size = transposed.shape[0]

    # Closed-form solution of the propagation: (1 - alpha) * (I - alpha * W'^T)^-1
    system = np.identity(size) - alpha * transposed
    return np.linalg.inv(system) * (1 - alpha)




# [docs]
def network_propagation(individual_heats_matrix, nodes, seed_genes):
    """
    Implements network propagation, as detailed in:

    Vanunu, Oron, et al. 'Associating genes and protein complexes with
    disease via network propagation.'

    Using this function, the final heat of the network is calculated directly,
    instead of iteratively. This method is faster when many different
    propagations need to be performed on the same network (with different seed
    gene sets). It is slower than
    :py:func:`~netcoloc.netprop.iterative_network_propagation` for a
    single propagation.

    :param individual_heats_matrix: Square matrix that is the
                                    output of :py:func:`~netcoloc.netprop.get_individual_heats_matrix`
    :type individual_heats_matrix: :py:class:`numpy.ndarray`
    :param nodes: Nodes in the network represented by the
            individual_heats_matrix, in the same order in which they were
            supplied to :py:func:`~netcoloc.netprop.get_individual_heats_matrix`.
            Any ordered iterable is accepted; it is copied into a list.
    :type nodes: list
    :param seed_genes: Input list of genes/nodes for initializing the heat in network propagation.
            Any items in `seed_genes` that are not present in `nodes` will be ignored.
    :type seed_genes: list
    :return: Final heat of each node after propagation, with the name
             of the nodes as the index. If none of the seed genes is in the
             network, a warning is issued and every heat is ``NaN``.
    :rtype: :py:class:`pandas.Series`
    """
    nodes = list(nodes)

    # Map each node to its column once instead of scanning the list for every
    # seed gene; setdefault keeps the first occurrence, like list.index.
    node_to_column = {}
    for column, node in enumerate(nodes):
        node_to_column.setdefault(node, column)

    # Remove genes that are not in network (result is sorted and de-duplicated)
    seeds_in_network = [gene for gene in np.intersect1d(nodes, seed_genes)
                        if gene in node_to_column]

    if not seeds_in_network:
        # Averaging over zero seeds is undefined; return the same all-NaN
        # result the division by zero would give, but with a clear message.
        warnings.warn("None of the seed genes are present in the network; "
                      "returned heats are NaN.")
        return pd.Series(np.full(len(nodes), np.nan), index=nodes)

    # Add up resulting heats from each gene in seed genes set. Column j of
    # the matrix is the heat every node receives when node j is seeded
    # (final heat = matrix @ seed vector), so columns are summed.
    heat = np.zeros(len(nodes))
    for gene in seeds_in_network:
        heat += individual_heats_matrix[:, node_to_column[gene]]

    # Normalize results by number of seed genes
    heat /= len(seeds_in_network)

    return pd.Series(heat, index=nodes)