Source code for disim.stats

# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Christopher Kirkos. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Created on May 2, 2011

Statistic generation for DISim

:Author: Christopher Kirkos
'''

from __future__ import division

import numpy as np
import scikits.statsmodels as sm
import numexpr as ne
#from itertools import product
from os.path import isdir, join as pathjoin

[docs]def standardizeCoeff(A, sample=True):
    """Element-wise, subtract the mean of the values from each value and 
    divide by the standard deviation.
    
    :param numpy.Array A: A numpy array.
    :param bool sample: Whether the `Array` `A` represents a sample or 
                           population (True if sample).
    :returns: A copy/view of the input Array with values standardized.
    :rtype: numpy.Array
    """
    sp = 1 if sample else 0 # sample standard deviation or population?
    return (A-np.mean(A))/np.std(A,ddof=sp)

[docs]def optimizedMinmax(A, newMin=0.0, newMax=1.0):
    """An optimized version of `minmax` using `numexpr`."""
    mn, mx = np.min(A), np.max(A)
    return ne.evaluate("(((A-mn)/(mx-mn))*(newMax-newMin))+newMin")

[docs]def minmax(A, newMin=0.0, newMax=1.0):
    """Min-Max normalization.
    
    :param numpy.Array A: A 1D numpy Array object.
    :param float newMin: The new minimum value.
    :param float newMax: The new maximum value.
    :returns: A copy/view of the input Array with values normalized.
    :rtype: numpy.Array
    """
    mn, mx = np.min(A), np.max(A)
    return (((A-mn)/(mx-mn))*(newMax-newMin))+newMin

[docs]def runOLSRegression1997(expTrialLogFilePath,
                         trickleDirection = "down",
                         peripheralTieRange = (0,185),
                         densityRange = None,
                         withBoundaryAnalysis=False,
                         outFilePath=None):
    """Perform Ordinary Least Squares (OLS) regression on the trial output
    data to determine which parameters have the most effect on the extent
    of peripheral diffusion.
    
    :param str expTrialLogFilePath: The full path to the experiment's trial
                                       log file.
    :param str trickleDirection: Either "up" or "down". If "up", core diffusion
                                 is selected as the dependent variable in the
                                 regression analysis. If "down", peripheral
                                 diffusion is selected as the dependent 
                                 variable.
    :param boolean withBoundaryAnalysis: Whether or not to include the boundary
                                         weaknesses and pressure points in the
                                         multiple regression.
    :param tuple peripheralTieRange: Limit the regression analysis to the 
                                     records where the number of peripheral
                                     ties is within (including) the specified
                                     range (min, max). If `None` is specified, 
                                     then the range defaults to the entire 
                                     range of the sample (no restriction).
    :param tuple densityRange: Limit regression analysis to the records where
                               the peripheral density is within (including)
                               the specified range (Eg. *(0,0.5)*). If `None` 
                               is specified, then no density restriction is 
                               placed on the records. 
    
    .. note:: 
        The original authors only simulated with the number of peripheral
        ties within range [0,185]. In their second set of simulations, they 
        performed regressions on the same simulation but split the data set 
        by density of peripheral ties. They split it in half with the first 
        set having density <=0.5 and the second set with records having 
        density >0.5. This is the reasoning behind the `peripheralTieRange` 
        and `densityRange` function parameters.
    
    """
    # 7 Columns (from log file):
    # (0) periphery ties 
    # (1) ambiguity Ai
    # (2) trial #
    # (3) # core adopters
    # (4) total # core nodes 
    # (5) # periph adopters
    # (6) total # periph nodes
    # (7) # boundary weaknesses
    # (8) # boundary pressure points
    
    # Test output path, create new output name
    if outFilePath and isdir(outFilePath):
        ptr = peripheralTieRange
        ptiesStr = "{0:d}To{1:d}PTies-".format(ptr[0],ptr[1]) if ptr else ""
        dr = densityRange 
        pdensStr = "PDensity{0:.2f}To{1:.2f}-".format(dr[0],dr[1]) if dr else ""
        boundStr = "{0:s}Boundaries".format( "With" if withBoundaryAnalysis \
                                         else "Without" )
        fname = "Regression-{0:s}{1:s}{2:s}.txt".format(ptiesStr,pdensStr,
                                                        boundStr)
        outFilePath = pathjoin(outFilePath,fname)
        
    # Import trial data from CSV file
    trialLogArray = np.genfromtxt(expTrialLogFilePath, dtype=np.float, 
                                  delimiter=',')
    
    depIdx = 5 if trickleDirection=="down" else 3
    diffIndepIdx = 3 if trickleDirection=="down" else 5
    
    # y is the dependent variable, the number of peripheral adopters
    y = yorig = trialLogArray[:,depIdx]
    
    x1 = trialLogArray[:,1] # Ambiguity
    
    x2 = trialLogArray[:,diffIndepIdx] #/trialLogArray[:,4] # Diffusion
    
    # numexpr optimized version to calculate peripheral density
    x3 = optimizedCalcPeriphDensity(trialLogArray[:,0], trialLogArray[:,4], 
                                    trialLogArray[:,6])
    
    
    diffDepStr = "Core" if trickleDirection=="down" else "Peripheral"
    yname = "%s diffusion" % "Peripheral" if trickleDirection=="down" else "Core"
    xnames = ["Ambiguity", "%s diff." % diffDepStr, "Per. Dens."]
    
    indep = [x1,x2,x3] # independent variables    
    if withBoundaryAnalysis:
        x4 = trialLogArray[:,7] # weaknesses
        x5 = trialLogArray[:,8] # pressure points
        indep.extend([x4,x5])
        xnames.extend(["Weaknesses", "Press. Pnts"])
    
    # Begin down selecting records based on input conditions
    pties = trialLogArray[:,0]
    ptr0,ptr1 = peripheralTieRange if peripheralTieRange else \
                                                (np.min(pties), np.max(pties)) 
    # selecting on pties, the # peripheral ties
    tieMask = np.ma.masked_inside(pties, ptr0, ptr1)
    
    densA = optimizedCalcNxDensity(trialLogArray[:,0], trialLogArray[:,4], 
                           trialLogArray[:,6])
    dr0,dr1 = densityRange if densityRange else (np.min(densA), np.max(densA))
    # Todo: ask authors which density they used? Nx or peripheral..
    # (originally) selecting on x3, the peripheral density independent variable
    # selecting on densA, the network density
    densityMask = np.ma.masked_inside(densA, dr0, dr1)
    
    combinedMaskVals = tieMask.mask & densityMask.mask

    indep = map(lambda x: x[combinedMaskVals], indep)
    y = y[combinedMaskVals]
    yorig = yorig[combinedMaskVals]
    
    # ensure we didn't down select to 0 sized arrays
    for x in indep + [y,yorig]:
        if len(x) <= 0:
            return None
    
    # standardize all coefficients
    indep = map(standardizeCoeff, indep)
    y = standardizeCoeff(y)
    
    # X represents the independent control variables
    X = np.array(indep, dtype=np.float).transpose()
    
    olsRegression = sm.OLS(y,X)
    olsFit = olsRegression.fit()
    regstats = (olsFit, (np.mean(yorig), np.std(yorig), np.min(yorig), 
                         np.max(yorig)))
    
    if outFilePath != None:
        with file(outFilePath, 'w') as outFileP:
            outFileP.write("Regression Summary\n")
            outFileP.write(olsFit.summary(yname=yname, xname=xnames))
            outFileP.write("\nMean: %f\nStdDev: %f\nMin: %f\nMax: %f\n" % \
                           (regstats[1]) )
    
    return regstats

[docs]def possibleTies(numberOfNodes, numCoreNodes):
    """ Calculates the max number of ties for a network with given node 
    statistics.
    
    Total undirected simple edges = (n*(n-1))/2
    Total peripheral ties = ties between nodes in the periphery and themselves,
    AND ties between nodes in the periphery and in the core core.
    
    Total peripheral ties is calculated as the ... 
    Total possible ties in the network minus the total ties between core nodes.
    
    :param integer numberOfNodes: The number of nodes in the network.
    :param integer numCoreNodes: The number of nodes in the core.
    :return: Tuple of the number of total possible ties in
             the network, the total possible core ties,
             and the total possible peripheral ties.
             
    .. todo:: 
        Write latex set notation to represent this calculation in the 
        docs.
    """
    totalPossibleTies = int( (numberOfNodes*(numberOfNodes-1))/2 )
    totalPossibleCoreTies = int( (numCoreNodes*(numCoreNodes-1))/2 )
    totalPossiblePeriphTies = int( totalPossibleTies - totalPossibleCoreTies )
    return (totalPossibleTies, totalPossibleCoreTies, totalPossiblePeriphTies)

[docs]def calcPerpipheralDensity(pties,coreNodes,periphNodes):
    """Calculate peripheral density, which is the number of ties in the
    periphery divided by the number of possible ties in the periphery. 
    
    All input values are scalars, not arrays/matrices.
    
    :param integer pties: The number of ties in the periphery.
    :param integer coreNodes: The number of nodes in the core.
    :param integer periphNodes: The number of nodes in the periphery.
    """
    totalNodes = coreNodes+periphNodes
    totalPossiblePeriphTies = possibleTies(totalNodes, coreNodes)[2]        
    return pties/totalPossiblePeriphTies
    
[docs]def optimizedCalcPeriphDensity(pt,cn,pn):
    """An optimized version of `calcPeripheralDensity` using the numexpr
    module.
    
    All input parameters are numpy Arrays.
    """
    tn = cn+pn # total nodes
    # totalPossibleTies = (tn*(tn-1))/2.0
    # totalPossibleCoreTies = (cn*(cn-1))/2.0
    # tot possible periph ties = totalPossibleTies - totalPossibleCoreTies
    # actual peripheral ties / tot possible periph ties  
    # pt/(( (tn*(tn-1))/2.0 ) - ( (cn*(cn-1))/2.0 ))
    return ne.evaluate("pt/( ((tn*(tn-1))/2.0) - ((cn*(cn-1))/2.0) )")


[docs]def optimizedCalcNxDensity(pt,cn,pn):
    "An optimized version of `calcNxDensity`"
    tn = cn+pn
    return ne.evaluate("(pt+((cn*(cn-1))/2.0)) / ((tn*(tn-1))/2.0)")

[docs]def calcNxDensity(pties,coreNodes,periphNodes):
    """Calculate the density of a network with the given number of peripheral
    ties, core nodes, and peripheral nodes"""
    totalPossibleTies, totalPossibleCoreTies, totalPossiblePeriphTies = \
                                possibleTies(coreNodes+periphNodes, coreNodes)
    return (pties+totalPossibleCoreTies)/totalPossibleTies

if __name__ == "__main__":
    import disim
    basepath = '/home/prima/Development/tmp/disim/testreg'
    infile = 'experimentTrialLog-n31.csv'
    disim.fullRegressionAnalysis(basepath, infile)
Navigation

Source code for disim.stats

Quick search

Navigation