# Source code for disim.stats

# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Christopher Kirkos. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

'''
Created on May 2, 2011

Statistic generation for DISim

:Author: Christopher Kirkos
'''

from __future__ import division

import numpy as np
import scikits.statsmodels as sm
import numexpr as ne
#from itertools import product
from os.path import isdir, join as pathjoin

def standardizeCoeff(A, sample=True):
    """Standardize an array to zero mean and unit standard deviation.

    Element-wise, subtract the mean of the values from each value and
    divide by the standard deviation (a z-score transform).

    :param numpy.Array A: A numpy array.
    :param bool sample: Whether the `Array` `A` represents a sample or
                        population (True if sample).
    :returns: A copy/view of the input Array with values standardized.
    :rtype: numpy.Array
    """
    # Bessel's correction (ddof=1) applies only when A is a sample.
    ddof = 1 if sample else 0
    centered = A - np.mean(A)
    return centered / np.std(A, ddof=ddof)
def optimizedMinmax(A, newMin=0.0, newMax=1.0):
    """An optimized version of `minmax` using `numexpr`.

    Rescales the values of `A` linearly into the range [newMin, newMax].
    The expression string references the local names A, mn, mx, newMin
    and newMax, which numexpr resolves from this function's local scope.

    NOTE(review): if all elements of A are equal, mx - mn is 0 and the
    evaluation divides by zero — same hazard as `minmax`; confirm callers
    never pass a constant array.
    """
    mn, mx = np.min(A), np.max(A)
    return ne.evaluate("(((A-mn)/(mx-mn))*(newMax-newMin))+newMin")
def minmax(A, newMin=0.0, newMax=1.0):
    """Min-Max normalization.

    Linearly rescales the values of `A` so the smallest maps to `newMin`
    and the largest maps to `newMax`.

    :param numpy.Array A: A 1D numpy Array object.
    :param float newMin: The new minimum value.
    :param float newMax: The new maximum value.
    :returns: A copy/view of the input Array with values normalized.
    :rtype: numpy.Array
    """
    lo = np.min(A)
    hi = np.max(A)
    # Scale into [0, 1] first, then stretch/shift into the target range.
    unit = (A - lo) / (hi - lo)
    return unit * (newMax - newMin) + newMin
def runOLSRegression1997(expTrialLogFilePath, trickleDirection="down",
                         peripheralTieRange=(0, 185), densityRange=None,
                         withBoundaryAnalysis=False, outFilePath=None):
    """Perform Ordinary Least Squares (OLS) regression on the trial output
    data to determine which parameters have the most effect on the extent
    of peripheral diffusion.

    :param str expTrialLogFilePath: The full path to the experiment's
                                    trial log file.
    :param str trickleDirection: Either "up" or "down". If "up", core
                   diffusion is selected as the dependent variable in the
                   regression analysis. If "down", peripheral diffusion is
                   selected as the dependent variable.
    :param boolean withBoundaryAnalysis: Whether or not to include the
                   boundary weaknesses and pressure points in the multiple
                   regression.
    :param tuple peripheralTieRange: Limit the regression analysis to the
                   records where the number of peripheral ties is within
                   (including) the specified range (min, max). If `None`
                   is specified, the range defaults to the entire range of
                   the sample (no restriction).
    :param tuple densityRange: Limit regression analysis to the records
                   where the peripheral density is within (including) the
                   specified range (e.g. *(0, 0.5)*). If `None` is
                   specified, then no density restriction is placed on the
                   records.
    :param str outFilePath: Optional directory in which to write a summary
                   report; a file name is derived from the parameters.
    :returns: A tuple ``(olsFit, (mean, std, min, max))`` of the fitted
              model and descriptive statistics of the (unstandardized)
              dependent variable, or ``None`` if the record selection
              leaves no rows.

    .. note:: The original authors only simulated with the number of
              peripheral ties within range [0,185]. In their second set of
              simulations, they performed regressions on the same
              simulation but split the data set by density of peripheral
              ties (first set density <= 0.5, second set density > 0.5).
              This is the reasoning behind the `peripheralTieRange` and
              `densityRange` function parameters.
    """
    # Columns (from log file):
    # (0) periphery ties        (1) ambiguity Ai      (2) trial #
    # (3) # core adopters       (4) total # core nodes
    # (5) # periph adopters     (6) total # periph nodes
    # (7) # boundary weaknesses (8) # boundary pressure points

    # Test output path, create new output file name from the parameters.
    if outFilePath and isdir(outFilePath):
        ptr = peripheralTieRange
        ptiesStr = "{0:d}To{1:d}PTies-".format(ptr[0], ptr[1]) if ptr else ""
        dr = densityRange
        pdensStr = "PDensity{0:.2f}To{1:.2f}-".format(dr[0], dr[1]) if dr else ""
        boundStr = "{0:s}Boundaries".format(
                        "With" if withBoundaryAnalysis else "Without")
        fname = "Regression-{0:s}{1:s}{2:s}.txt".format(ptiesStr, pdensStr,
                                                        boundStr)
        outFilePath = pathjoin(outFilePath, fname)

    # Import trial data from CSV file.
    # FIX: np.float is removed in modern numpy; the builtin float is the
    # documented replacement and is what np.float aliased.
    trialLogArray = np.genfromtxt(expTrialLogFilePath, dtype=float,
                                  delimiter=',')

    depIdx = 5 if trickleDirection == "down" else 3
    diffIndepIdx = 3 if trickleDirection == "down" else 5

    # y is the dependent variable (peripheral adopters for trickle-down,
    # core adopters for trickle-up); yorig keeps the unstandardized copy.
    y = yorig = trialLogArray[:, depIdx]
    x1 = trialLogArray[:, 1]             # Ambiguity
    x2 = trialLogArray[:, diffIndepIdx]  # Diffusion (opposite group)
    # numexpr-optimized peripheral density from (pties, core n, periph n).
    x3 = optimizedCalcPeriphDensity(trialLogArray[:, 0],
                                    trialLogArray[:, 4],
                                    trialLogArray[:, 6])

    diffDepStr = "Core" if trickleDirection == "down" else "Peripheral"
    # FIX: the original expression
    #   "%s diffusion" % "Peripheral" if trickleDirection=="down" else "Core"
    # parses as a conditional whose false branch is the bare string "Core",
    # so the "up" case lost the " diffusion" suffix. Parenthesize.
    yname = "%s diffusion" % ("Peripheral" if trickleDirection == "down"
                              else "Core")
    xnames = ["Ambiguity", "%s diff." % diffDepStr, "Per. Dens."]
    indep = [x1, x2, x3]  # independent variables
    if withBoundaryAnalysis:
        x4 = trialLogArray[:, 7]  # weaknesses
        x5 = trialLogArray[:, 8]  # pressure points
        indep.extend([x4, x5])
        xnames.extend(["Weaknesses", "Press. Pnts"])

    # Begin down-selecting records based on input conditions.
    pties = trialLogArray[:, 0]
    ptr0, ptr1 = peripheralTieRange if peripheralTieRange else \
                    (np.min(pties), np.max(pties))
    # masked_inside marks (mask=True) the records INSIDE the range; those
    # are the rows we keep below.
    tieMask = np.ma.masked_inside(pties, ptr0, ptr1)

    densA = optimizedCalcNxDensity(trialLogArray[:, 0],
                                   trialLogArray[:, 4],
                                   trialLogArray[:, 6])
    dr0, dr1 = densityRange if densityRange else (np.min(densA),
                                                  np.max(densA))
    # Todo: ask authors which density they used? Nx or peripheral..
    # (originally) selecting on x3, the peripheral density variable;
    # currently selecting on densA, the network density.
    densityMask = np.ma.masked_inside(densA, dr0, dr1)

    # FIX: when nothing falls in a range, .mask is the scalar `nomask`
    # rather than a boolean array; getmaskarray always yields a full
    # per-element boolean array so the & and fancy-indexing stay valid.
    combinedMaskVals = (np.ma.getmaskarray(tieMask) &
                        np.ma.getmaskarray(densityMask))
    # FIX: Python 3 `map` returns a lazy iterator, which breaks the list
    # concatenation and np.array() construction below; use comprehensions.
    indep = [x[combinedMaskVals] for x in indep]
    y = y[combinedMaskVals]
    yorig = yorig[combinedMaskVals]

    # Ensure we didn't down-select to 0-sized arrays.
    for x in indep + [y, yorig]:
        if len(x) <= 0:
            return None

    # Standardize all coefficients.
    indep = [standardizeCoeff(x) for x in indep]
    y = standardizeCoeff(y)

    # X represents the independent control variables (rows = records).
    X = np.array(indep, dtype=float).transpose()
    olsRegression = sm.OLS(y, X)
    olsFit = olsRegression.fit()

    regstats = (olsFit, (np.mean(yorig), np.std(yorig),
                         np.min(yorig), np.max(yorig)))

    if outFilePath is not None:
        # FIX: the `file` builtin is Python-2-only; `open` is equivalent.
        with open(outFilePath, 'w') as outFileP:
            outFileP.write("Regression Summary\n")
            outFileP.write(olsFit.summary(yname=yname, xname=xnames))
            outFileP.write("\nMean: %f\nStdDev: %f\nMin: %f\nMax: %f\n" %
                           (regstats[1]))

    return regstats
def possibleTies(numberOfNodes, numCoreNodes):
    """Calculate the max number of ties for a network with given node
    statistics.

    Total undirected simple edges = (n*(n-1))/2.
    Total peripheral ties = ties between nodes in the periphery and
    themselves, AND ties between nodes in the periphery and in the core.
    Total peripheral ties is calculated as the total possible ties in the
    network minus the total ties between core nodes.

    :param integer numberOfNodes: The number of nodes in the network.
    :param integer numCoreNodes: The number of nodes in the core.
    :return: Tuple of the number of total possible ties in the network,
             the total possible core ties, and the total possible
             peripheral ties.

    .. todo:: Write latex set notation to represent this calculation in
              the docs.
    """
    # FIX: int((n*(n-1))/2) routes an exact integer through float true
    # division (this module imports __future__ division), which loses
    # precision once n*(n-1) exceeds 2**53. n*(n-1) is always even, so
    # integer floor division is exact and gives identical results for
    # all realistic sizes.
    totalPossibleTies = numberOfNodes * (numberOfNodes - 1) // 2
    totalPossibleCoreTies = numCoreNodes * (numCoreNodes - 1) // 2
    totalPossiblePeriphTies = totalPossibleTies - totalPossibleCoreTies
    return (totalPossibleTies, totalPossibleCoreTies,
            totalPossiblePeriphTies)
def calcPerpipheralDensity(pties, coreNodes, periphNodes):
    """Calculate peripheral density: the number of ties in the periphery
    divided by the number of possible ties in the periphery.

    All input values are scalars, not arrays/matrices.

    NOTE(review): the function name carries a historical typo
    ("Perpipheral"); it is kept as-is because it is the public name.

    :param integer pties: The number of ties in the periphery.
    :param integer coreNodes: The number of nodes in the core.
    :param integer periphNodes: The number of nodes in the periphery.
    """
    # Possible-tie counts inlined from `possibleTies`: peripheral ties are
    # all network ties minus the core-only ties.
    n = coreNodes + periphNodes
    maxNetworkTies = int((n * (n - 1)) / 2)
    maxCoreTies = int((coreNodes * (coreNodes - 1)) / 2)
    maxPeriphTies = int(maxNetworkTies - maxCoreTies)
    return pties / maxPeriphTies
def optimizedCalcPeriphDensity(pt, cn, pn):
    """An optimized version of `calcPeripheralDensity` using the numexpr
    module. All input parameters are numpy Arrays.

    :param pt: actual peripheral tie counts (array).
    :param cn: core node counts (array).
    :param pn: peripheral node counts (array).
    :returns: element-wise peripheral density: actual peripheral ties
              divided by the possible peripheral ties, where possible
              peripheral ties = all possible ties minus possible
              core-only ties.

    NOTE(review): the sibling scalar function is actually named
    `calcPerpipheralDensity` (typo) — the cross-reference above does not
    resolve; confirm intended target.
    """
    tn = cn+pn # total nodes
    # Derivation of the evaluated expression:
    #   totalPossibleTies     = (tn*(tn-1))/2.0
    #   totalPossibleCoreTies = (cn*(cn-1))/2.0
    #   possible periph ties  = totalPossibleTies - totalPossibleCoreTies
    #   density               = pt / possible periph ties
    # numexpr resolves pt/tn/cn by name from this local scope.
    return ne.evaluate("pt/( ((tn*(tn-1))/2.0) - ((cn*(cn-1))/2.0) )")
def optimizedCalcNxDensity(pt, cn, pn):
    """An optimized version of `calcNxDensity` using numexpr.

    All inputs are numpy Arrays. Element-wise network density: actual
    edges (peripheral ties plus the fully-connected core's ties) divided
    by all possible edges in a simple undirected graph of cn+pn nodes.

    NOTE(review): assumes the core is a complete subgraph, mirroring
    `calcNxDensity` — confirm against the simulation model.
    """
    tn = cn+pn
    # (pt + core ties) / total possible ties; names resolved by numexpr
    # from this local scope.
    return ne.evaluate("(pt+((cn*(cn-1))/2.0)) / ((tn*(tn-1))/2.0)")
def calcNxDensity(pties, coreNodes, periphNodes):
    """Calculate the density of a network with the given number of
    peripheral ties, core nodes, and peripheral nodes.

    Density is actual edges (peripheral ties plus the complete core's
    ties) over all possible edges among coreNodes+periphNodes nodes.
    """
    # Possible-tie counts inlined from `possibleTies`.
    n = coreNodes + periphNodes
    maxTies = int((n * (n - 1)) / 2)
    coreTies = int((coreNodes * (coreNodes - 1)) / 2)
    return (pties + coreTies) / maxTies
if __name__ == "__main__":
    # Ad-hoc manual driver: runs the full regression analysis on a fixed
    # trial log. NOTE(review): the base path is a developer-local absolute
    # path and will not exist elsewhere — parameterize before reuse.
    import disim
    basepath = '/home/prima/Development/tmp/disim/testreg'
    infile = 'experimentTrialLog-n31.csv'
    disim.fullRegressionAnalysis(basepath, infile)