# 15 MAY 2009 # THIS VERSION IS LIMITED TO TWO PRINCIPAL COMPONENTS BY COMMENTING OUT THE EXTRA PRINCIPAL COMPONENTS. I DISCOVERED WHEN ADDING CODE FOR THE REMOVAL OF OUTLIERS THAT MISSING PRINCIPAL COMPONENT VALUES WOULD TRIGGER THE FLAG EVERY TIME. THIS WOULD LEAD TO DIVISION BY ZERO ERROR(S) BECAUSE THE PCnum VALUES WERE BEING REDUCED TO ZERO. I MAY AT SOME POINT WRITE CODE TO CATCH THIS...BUT NOT TODAY... BEGIN { # Read input as tab-delimited files. FS = "\t" # Set counters and variables i = 1 # Variable num_dis is defined as the number of different metabolic states that will form the distance matrix. # The value is set to one initially and will be increased as appropriate by the program. num_dis = 1 # Variable num_data_pts is defined as the total number of NMR samples. # This should be the same as the total number of data points plotted in PCA scores space. # The value is set to one initially and it will be increased as appropriate by the program. num_data_pts = 1 # Set the seed for the random number generator. The citation for the random number generator is Weisstein, Eric W. "Cliff Random Number Generator." From Math World - A Wolfram Web Resource. http://mathworld.wolfram.com/CliffRandomNumberGenerator.html _cliff_seed = 0.1 # Remove earlier versions of data files prior to appending new information during creation of the new version system ("rm PCavg") system ("rm PCstdev") system ("rm DistMatrix") system ("rm FlagTest") system ("rm BootTest") system ("rm BootTest1") } { # Read data from a SIMCA generated PC scores file in tab-delimited format. # Be careful with data files modified in Excel or other programs (possibly including SIMCA) as unseen end of line codes, missing tab delimiters and other hidden codes in the file may create problems in Awk. # The first record will be skipped because it contains only column label information that is not needed by the program. # data_num[num_data_pts] is the Observation ID number column from SIMCA. # data_typ[num_data_pts] is the "metabolic state" labels (second column) in the SIMCA file. # The principal component values appear starting in column #3 of the SIMCA file. if (NR > 1 ) { data_num[num_data_pts]=$1 data_typ[num_data_pts]=$2 PC1[num_data_pts]=$3 PC2[num_data_pts]=$4 # PC3[num_data_pts]=$5 # PC4[num_data_pts]=$6 # PC5[num_data_pts]=$7 # PC6[num_data_pts]=$8 # PC7[num_data_pts]=$9 # PC8[num_data_pts]=$10 num_data_pts++ } } END { # Add the PC scores values for each different metabolic state as PC1sum, PC2sum, PC3sum, PCXsum. # Count the number of score values added as PC1num, PC2num, PC3num, PCXnum. This is the number of samples run for a given metabolic state. # Count the number of different metabolic states using the variable num_dis. for (i = 1; i <= num_data_pts; ++i) { if (data_typ[i]!=data_typ[i-1] && i > 1) ++num_dis Meta_state[num_dis] = data_typ[i] PC1sum[num_dis] = (PC1sum[num_dis] + PC1[i]) ++PC1num[num_dis] PC2sum[num_dis] = (PC2sum[num_dis] + PC2[i]) ++PC2num[num_dis] # PC3sum[num_dis] = (PC3sum[num_dis] + PC3[i]) # ++PC3num[num_dis] # PC4sum[num_dis] = (PC4sum[num_dis] + PC4[i]) # ++PC4num[num_dis] # PC5sum[num_dis] = (PC5sum[num_dis] + PC5[i]) # ++PC5num[num_dis] # PC6sum[num_dis] = (PC6sum[num_dis] + PC6[i]) # ++PC6num[num_dis] # PC7sum[num_dis] = (PC7sum[num_dis] + PC7[i]) # ++PC7num[num_dis] # PC8sum[num_dis] = (PC8sum[num_dis] + PC8[i]) # ++PC8num[num_dis] } # This loop prints the metabolic state list to the screen as a check during program execution for (i = 1; i <= num_dis; ++i) { print Meta_state[i] print num_dis } # Calculate the average PC scores values for each metabolic state. # Store the values in a file called "PCavg" for (i = 1; i < num_dis; ++i) { PC1avg[i] = PC1sum[i] /PC1num[i] PC2avg[i] = PC2sum[i] /PC2num[i] # PC3avg[i] = PC3sum[i] /PC3num[i] # PC4avg[i] = PC4sum[i] /PC4num[i] # PC5avg[i] = PC5sum[i] /PC5num[i] # PC6avg[i] = PC6sum[i] /PC6num[i] # PC7avg[i] = PC7sum[i] /PC7num[i] # PC8avg[i] = PC8sum[i] /PC8num[i] print PC1avg[i], PC2avg[i], PC3avg[i], PC4avg[i], PC5avg[i], PC6avg[i], PC7avg[i], PC8avg[i] >> "PCavg" } # Calculate the standard deviation(n-1) of the PC scores values for each metabolic state. # Store the results in a file called "PCstdev". # The code below calculates the difference between the individual data point and the respective average, squares that result and then sums the squares values in preparation for the final standard deviation calculation. num_dis = 1 for (j = 1; j <= num_data_pts; ++j) { if (data_typ[j]!=data_typ[j-1] && j > 1) ++num_dis PC1deltasq[num_dis] = (PC1[j] - PC1avg[num_dis])^2 PC1deltasqsum[num_dis] = PC1deltasqsum[num_dis] + PC1deltasq[num_dis] PC2deltasq[num_dis] = (PC2[j] - PC2avg[num_dis])^2 PC2deltasqsum[num_dis] = PC2deltasqsum[num_dis] + PC2deltasq[num_dis] # PC3deltasq[num_dis] = (PC3[j] - PC3avg[num_dis])^2 # PC3deltasqsum[num_dis] = PC3deltasqsum[num_dis] + PC3deltasq[num_dis] # PC4deltasq[num_dis] = (PC4[j] - PC4avg[num_dis])^2 # PC4deltasqsum[num_dis] = PC4deltasqsum[num_dis] + PC4deltasq[num_dis] # PC5deltasq[num_dis] = (PC5[j] - PC5avg[num_dis])^2 # PC5deltasqsum[num_dis] = PC5deltasqsum[num_dis] + PC5deltasq[num_dis] # PC6deltasq[num_dis] = (PC6[j] - PC6avg[num_dis])^2 # PC6deltasqsum[num_dis] = PC6deltasqsum[num_dis] + PC6deltasq[num_dis] # PC7deltasq[num_dis] = (PC7[j] - PC7avg[num_dis])^2 # PC7deltasqsum[num_dis] = PC7deltasqsum[num_dis] + PC7deltasq[num_dis] # PC8deltasq[num_dis] = (PC8[j] - PC8avg[num_dis])^2 # PC8deltasqsum[num_dis] = PC8deltasqsum[num_dis] + PC8deltasq[num_dis] } # This code calculates the (n-1) standard deviations for each PC using the PCdeltasqsum values from above # The results are appended to the file "PCstdev". for (k = 1; k <= (num_dis - 1); ++k) { PC1stdev[k] = sqrt((1 / (PC1num[k] - 1)) * (PC1deltasqsum[k])) PC2stdev[k] = sqrt((1 / (PC2num[k] - 1)) * (PC2deltasqsum[k])) # PC3stdev[k] = sqrt((1 / (PC3num[k] - 1)) * (PC3deltasqsum[k])) # PC4stdev[k] = sqrt((1 / (PC4num[k] - 1)) * (PC4deltasqsum[k])) # PC5stdev[k] = sqrt((1 / (PC5num[k] - 1)) * (PC5deltasqsum[k])) # PC6stdev[k] = sqrt((1 / (PC6num[k] - 1)) * (PC6deltasqsum[k])) # PC7stdev[k] = sqrt((1 / (PC7num[k] - 1)) * (PC7deltasqsum[k])) # PC8stdev[k] = sqrt((1 / (PC8num[k] - 1)) * (PC8deltasqsum[k])) print PC1stdev[k], PC2stdev[k], PC3stdev[k], PC4stdev[k], PC5stdev[k], PC6stdev[k], PC7stdev[k], PC8stdev[k] >> "PCstdev" } # This code checks the entire data set for points that are outliers for their given metabolic state. # The program will exclude points that more than two standard deviations from the average value for the given principal component. num_dis = 1 for (j = 1; j <= num_data_pts; ++j) { if (data_typ[j]!=data_typ[j-1] && j > 1) ++num_dis PC1upper[num_dis] = PC1avg[num_dis] + (2 * PC1stdev[num_dis]) PC1lower[num_dis] = PC1avg[num_dis] - (2 * PC1stdev[num_dis]) if ((PC1[j] >= PC1upper[num_dis]) || (PC1[j] <= PC1lower[num_dis])) flag[j] = 1 else flag[j] = 0 print PC1[j], flag[j], PC1upper[num_dis], PC1lower[num_dis] >> "FlagTest" PC2upper[num_dis] = PC2avg[num_dis] + (2 * PC2stdev[num_dis]) PC2lower[num_dis] = PC2avg[num_dis] - (2 * PC2stdev[num_dis]) if ((PC2[j] >= PC2upper[num_dis]) || (PC2[j] <= PC2lower[num_dis])) flag[j] = 1 print PC2[j], flag[j], PC2upper[num_dis], PC2lower[num_dis] >> "FlagTest" print "\n" >> "FlagTest" # PC3upper[num_dis] = PC3avg[num_dis] + (2 * PC3stdev[num_dis]) # PC3lower[num_dis] = PC3avg[num_dis] - (2 * PC3stdev[num_dis]) # if ((PC3[j] >= PC3upper[num_dis]) || (PC3[j] <= PC3lower[num_dis])) flag[j] = 1 # print PC3[j], flag[j], PC3upper[num_dis], PC3lower[num_dis] >> "FlagTest" # print "\n" >> "FlagTest" # PC4upper[num_dis] = PC4avg[num_dis] + (2 * PC4stdev[num_dis]) # PC4lower[num_dis] = PC4avg[num_dis] - (2 * PC4stdev[num_dis]) # if ((PC4[j] >= PC4upper[num_dis]) || (PC4[j] <= PC4lower[num_dis])) flag[j] = 1 # print PC4[j], flag[j], PC4upper[num_dis], PC4lower[num_dis] >> "FlagTest" # print "\n" >> "FlagTest" # PC5upper[num_dis] = PC5avg[num_dis] + (2 * PC5stdev[num_dis]) # PC5lower[num_dis] = PC5avg[num_dis] - (2 * PC5stdev[num_dis]) # if ((PC5[j] >= PC5upper[num_dis]) || (PC5[j] <= PC5lower[num_dis])) flag[j] = 1 # print PC5[j], flag[j], PC5upper[num_dis], PC5lower[num_dis] >> "FlagTest" # print "\n" >> "FlagTest" # PC6upper[num_dis] = PC6avg[num_dis] + (2 * PC6stdev[num_dis]) # PC6lower[num_dis] = PC6avg[num_dis] - (2 * PC6stdev[num_dis]) # if ((PC6[j] >= PC6upper[num_dis]) || (PC6[j] <= PC6lower[num_dis])) flag[j] = 1 # print PC6[j], flag[j], PC6upper[num_dis], PC6lower[num_dis] >> "FlagTest" # print "\n" >> "FlagTest" # PC7upper[num_dis] = PC7avg[num_dis] + (2 * PC7stdev[num_dis]) # PC7lower[num_dis] = PC7avg[num_dis] - (2 * PC7stdev[num_dis]) # if ((PC7[j] >= PC7upper[num_dis]) || (PC7[j] <= PC7lower[num_dis])) flag[j] = 1 # print PC7[j], flag[j], PC7upper[num_dis], PC7lower[num_dis] >> "FlagTest" # print "\n" >> "FlagTest" # PC8upper[num_dis] = PC8avg[num_dis] + (2 * PC8stdev[num_dis]) # PC8lower[num_dis] = PC8avg[num_dis] - (2 * PC8stdev[num_dis]) # if ((PC8[j] >= PC8upper[num_dis]) || (PC8[j] <= PC8lower[num_dis])) flag[j] = 1 # print PC8[j], flag[j], PC8upper[num_dis], PC8lower[num_dis] >> "FlagTest" # print "\n" >> "FlagTest" } # This ocde recalcualtes the number of data points that remain for each metabolic state after the outliers have been flagged. for (i = 1; i <= (num_dis - 1); ++i) { PC1number[i] = PC1num[i] PC2number[i] = PC2num[i] # PC3number[i] = PC3num[i] # PC4number[i] = PC4num[i] # PC5number[i] = PC5num[i] # PC6number[i] = PC6num[i] # PC7number[i] = PC7num[i] # PC8number[i] = PC8num[i] } num_dis = 1 for (j = 1; j <= num_data_pts; ++j) { if (data_typ[j]!=data_typ[j-1] && j > 1) ++num_dis if (flag[j] == 1) PC1number[num_dis] = PC1number[num_dis] - 1 if (flag[j] == 1) PC2number[num_dis] = PC2number[num_dis] - 1 # if (flag[j] == 1) PC3number[num_dis] = PC3number[num_dis] - 1 # if (flag[j] == 1) PC4number[num_dis] = PC4number[num_dis] - 1 # if (flag[j] == 1) PC5number[num_dis] = PC5number[num_dis] - 1 # if (flag[j] == 1) PC6number[num_dis] = PC6number[num_dis] - 1 # if (flag[j] == 1) PC7number[num_dis] = PC7number[num_dis] - 1 # if (flag[j] == 1) PC8number[num_dis] = PC8number[num_dis] - 1 print flag[j], PC1number[num_dis], PC2number[num_dis] } # This code recalculates the average after the flagged data points have been removed. # First, several variables need to be reset to their respective starting values. for (i = 1; i <= num_dis; ++i) { PC1sum[i] = 0 PC2sum[i] = 0 # PC3sum[i] = 0 # PC4sum[i] = 0 # PC5sum[i] = 0 # PC6sum[i] = 0 # PC7sum[i] = 0 # PC8sum[i] = 0 } num_dis = 1 for (i = 1; i <= num_data_pts; ++i) { if (data_typ[i]!=data_typ[i-1] && i > 1) ++num_dis if (flag[i] == 1) continue PC1sum[num_dis] = (PC1sum[num_dis] + PC1[i]) PC2sum[num_dis] = (PC2sum[num_dis] + PC2[i]) # PC3sum[num_dis] = (PC3sum[num_dis] + PC3[i]) # PC4sum[num_dis] = (PC4sum[num_dis] + PC4[i]) # PC5sum[num_dis] = (PC5sum[num_dis] + PC5[i]) # PC6sum[num_dis] = (PC6sum[num_dis] + PC6[i]) # PC7sum[num_dis] = (PC7sum[num_dis] + PC7[i]) # PC8sum[num_dis] = (PC8sum[num_dis] + PC8[i]) } for (i = 1; i <= (num_dis - 1); ++i) { PC1avg[i] = PC1sum[i] /PC1number[i] PC2avg[i] = PC2sum[i] /PC2number[i] # PC3avg[i] = PC3sum[i] /PC3number[i] # PC4avg[i] = PC4sum[i] /PC4number[i] # PC5avg[i] = PC5sum[i] /PC5number[i] # PC6avg[i] = PC6sum[i] /PC6number[i] # PC7avg[i] = PC7sum[i] /PC7number[i] # PC8avg[i] = PC8sum[i] /PC8number[i] print PC1avg[i], PC2avg[i], PC3avg[i], PC4avg[i], PC5avg[i], PC6avg[i], PC7avg[i], PC8avg[i] >> "PCavg" } # Calculate the n x n distance martix that represents the distances between the average PCA coordinate positions for all of the metabolic states. # Distances are calculated using the standard geometric expression for ths distance between two points in space. # Save the result in a tab-delimited file named "DistMatrix" # The printf statements are arranged to provide the proper matrix formatting for use by PHYLIP software programs. printf("%s\n", (num_dis - 1)) >> "DistMatrix" for (i = 1; i <= (num_dis - 1); ++i) { printf("%-10s\t", Meta_state[i]) >> "DistMatrix" for (j = 1; j <= (num_dis - 1); ++j) { distance[i, j] = sqrt(((PC1avg[i] - PC1avg[j])^2) + ((PC2avg[i] - PC2avg[j])^2) + ((PC3avg[i] - PC3avg[j])^2) + ((PC4avg[i] - PC4avg[j])^2) + ((PC5avg[i] - PC5avg[j])^2) + ((PC6avg[i] - PC6avg[j])^2) + ((PC7avg[i] - PC7avg[j])^2) + ((PC8avg[i] - PC8avg[j])^2)) printf("%6.3f\t", distance[i, j]) >> "DistMatrix" } printf("\n") >> "DistMatrix" } printf("\n") >> "DistMatrix" # This portion of the program calculates variations of the original distance matrix for bootstrapping purposes. # # In the next for loop a random number generator is used to randomly read principal component values from a given metabolic state for calculation of a new PCsum value for that metabolic state. # After the new PCsum value is generated, a new PCavg value is calculated as was done above. # The new PCavg values are used to calculated a new variation of the original distance matrix. # Each new distance matrix is appended to the DistMatrix file. for (n = 1; n <= 99; ++n) { # This for loop resets the PCsum values to zero. for (i = 1; i <= (num_dis); ++i) { PC1sum[i] = 0 PC2sum[i] = 0 # PC3sum[i] = 0 # PC4sum[i] = 0 # PC5sum[i] = 0 # PC6sum[i] = 0 # PC7sum[i] = 0 # PC8sum[i] = 0 } # The following loops recalculate the PCsum values for each principal component 1 through 8. # For each metabolic state with n data points, a random combination of those n data points is created and used to calculate a new PCavg value for that metabolic state. # The first loop works its way through the metabolic states (represented by num_dis). # The second loop counts through the number of non-flagged data points in a given metabolic state. # The variable data_pt_sum is the total number of data points for metabolic states that have already been processed # Random[j] should not equal zero as that would use last data point from the previous dataset. # Because the integer function returns a truncated value, I had to "add 1" to the random[j] because the original coding would lose the last value in a given metabolic data set. for (i = 1; i <= (num_dis - 1); ++i) { for (j = 1; j <= PC1number[i]; ++j) { random[j] = (cliff_rand() * PC1num[i]) + 1 Random[j] = int(random[j]) print n, i, j, Random[j], PC1number[i] >> "BootTest1" if (Random[j] == 0){ j = j - 1 continue } if (Random[j] <= 0) Random[j] = -1 * Random[j] Index = data_pt_sum[i] + Random[j] if (flag[Index] == 0) { PC1sum[i] = PC1sum[i] + PC1[Index] print i, j, Index, PC1number[i], PC1[Index], flag[Index], PC1sum[i] >> "BootTest" } else { j = j - 1 } } data_pt_sum[i+1] = data_pt_sum[i] + PC1num[i] } print "\n" >> "BootTest" for (i = 1; i <= (num_dis - 1); ++i) { for (j = 1; j <= PC2number[i]; ++j) { random[j] = (cliff_rand() * PC2num[i]) + 1 Random[j] = int(random[j]) print n, i, j, Random[j], PC2number[i] >> "BootTest1" if (Random[j] == 0){ j = j - 1 continue } if (Random[j] <= 0) Random[j] = -1 * Random[j] Index = data_pt_sum[i] + Random[j] if (flag[Index] == 0) { PC2sum[i] = PC2sum[i] + PC2[Index] print i, j, Index, PC2number[i], PC2[Index], flag[Index], PC2sum[i] >> "BootTest" } else { j = j - 1 } } data_pt_sum[i+1] = data_pt_sum[i] + PC2num[i] } print "\n" >> "BootTest" for (i = 1; i <= (num_dis - 1); ++i) { for (j = 1; j <= PC3number[i]; ++j) { random[j] = (cliff_rand() * PC3num[i]) + 1 Random[j] = int(random[j]) print n, i, j, Random[j], PC3number[i] >> "BootTest1" if (Random[j] == 0){ j = j - 1 continue } if (Random[j] < 0) Random[j] = -1 * Random[j] Index = data_pt_sum[i] + Random[j] if (flag[Index] == 0) { PC3sum[i] = PC3sum[i] + PC3[Index] print i, j, Index, PC3number[i], PC3[Index], flag[Index], PC3sum[i] >> "BootTest" } else { j = j - 1 } } data_pt_sum[i+1] = data_pt_sum[i] + PC3num[i] } print "\n" >> "BootTest" for (i = 1; i <= (num_dis - 1); ++i) { for (j = 1; j <= PC4number[i]; ++j) { random[j] = (cliff_rand() * PC4num[i]) + 1 Random[j] = int(random[j]) print n, i, j, Random[j], PC4number[i] >> "BootTest1" if (Random[j] == 0){ j = j - 1 continue } if (Random[j] < 0) Random[j] = -1 * Random[j] Index = data_pt_sum[i] + Random[j] if (flag[Index] == 0) { PC4sum[i] = PC4sum[i] + PC4[Index] print i, j, Index, PC4number[i], PC4[Index], flag[Index], PC4sum[i] >> "BootTest" } else { j = j - 1 } } data_pt_sum[i+1] = data_pt_sum[i] + PC4num[i] } print "\n" >> "BootTest" for (i = 1; i <= (num_dis - 1); ++i) { for (j = 1; j <= PC5number[i]; ++j) { random[j] = (cliff_rand() * PC5num[i]) + 1 Random[j] = int(random[j]) print n, i, j, Random[j], PC5number[i] >> "BootTest1" if (Random[j] == 0){ j = j - 1 continue } if (Random[j] < 0) Random[j] = -1 * Random[j] Index = data_pt_sum[i] + Random[j] if (flag[Index] == 0) { PC5sum[i] = PC5sum[i] + PC5[Index] print i, j, Index, PC5number[i], PC5[Index], flag[Index], PC5sum[i] >> "BootTest" } else { j = j - 1 } } data_pt_sum[i+1] = data_pt_sum[i] + PC5num[i] } print "\n" >> "BootTest" for (i = 1; i <= (num_dis - 1); ++i) { for (j = 1; j <= PC6number[i]; ++j) { random[j] = (cliff_rand() * PC6number[i]) + 1 Random[j] = int(random[j]) print n, i, j, Random[j], PC6num[i] >> "BootTest1" if (Random[j] == 0){ j = j - 1 continue } if (Random[j] < 0) Random[j] = -1 * Random[j] Index = data_pt_sum[i] + Random[j] if (flag[Index] == 0) { PC6sum[i] = PC6sum[i] + PC6[Index] print i, j, Index, PC6number[i], PC6[Index], flag[Index], PC6sum[i] >> "BootTest" } else { j = j - 1 } } data_pt_sum[i+1] = data_pt_sum[i] + PC6num[i] b} print "\n" >> "BootTest" for (i = 1; i <= (num_dis - 1); ++i) { for (j = 1; j <= PC7number[i]; ++j) { random[j] = (cliff_rand() * PC7num[i]) + 1 Random[j] = int(random[j]) print n, i, j, Random[j], PC7number[i] >> "BootTest1" if (Random[j] == 0){ j = j - 1 continue } if (Random[j] < 0) Random[j] = -1 * Random[j] Index = data_pt_sum[i] + Random[j] if (flag[Index] == 0) { PC7sum[i] = PC7sum[i] + PC7[Index] print i, j, Index, PC7number[i], PC7[Index], flag[Index], PC7sum[i] >> "BootTest" } else { j = j - 1 } } data_pt_sum[i+1] = data_pt_sum[i] + PC7num[i] } print "\n" >> "BootTest" for (i = 1; i <= (num_dis - 1); ++i) { for (j = 1; j <= PC8number[i]; ++j) { random[j] = (cliff_rand() * PC8num[i]) + 1 Random[j] = int(random[j]) print n, i, j, Random[j], PC8number[i] >> "BootTest1" if (Random[j] == 0){ j = j - 1 continue } if (Random[j] < 0) Random[j] = -1 * Random[j] Index = data_pt_sum[i] + Random[j] if (flag[Index] == 0) { PC8sum[i] = PC8sum[i] + PC8[Index] print i, j, Index, PC8number[i], PC8[Index], flag[Index], PC8sum[i] >> "BootTest" } else { j = j - 1 } } data_pt_sum[i+1] = data_pt_sum[i] + PC8num[i] } print "\n" >> "BootTest" # This loop recalculates the average PC scores values for each metabolic state. # The values are appended to the file called "PCavg" for (i = 1; i < num_dis; ++i) { PC1avg[i] = PC1sum[i] /PC1number[i] PC2avg[i] = PC2sum[i] /PC2number[i] # PC3avg[i] = PC3sum[i] /PC3number[i] # PC4avg[i] = PC4sum[i] /PC4number[i] # PC5avg[i] = PC5sum[i] /PC5number[i] # PC6avg[i] = PC6sum[i] /PC6number[i] # PC7avg[i] = PC7sum[i] /PC7number[i] # PC8avg[i] = PC8sum[i] /PC8number[i] print PC1avg[i], PC2avg[i], PC3avg[i], PC4avg[i], PC5avg[i], PC6avg[i], PC7avg[i], PC8avg[i] >> "PCavg" } # Recalculate the n x n distance martix that represents the distances between the average PCA coordinate positions for all of the metabolic states. # Each new distance matrix is appended to the "DistMatrix" file. # The printf statements are arranged to provide the proper matrix formatting for use by PHYLIP software programs. printf("%s\n", (num_dis - 1)) >> "DistMatrix" for (i = 1; i <= (num_dis - 1); ++i) { printf("%-10s\t", Meta_state[i]) >> "DistMatrix" for (j = 1; j <= (num_dis - 1); ++j) { distance[i, j] = sqrt(((PC1avg[i] - PC1avg[j])^2) + ((PC2avg[i] - PC2avg[j])^2) + ((PC3avg[i] - PC3avg[j])^2) + ((PC4avg[i] - PC4avg[j])^2) + ((PC5avg[i] - PC5avg[j])^2) + ((PC6avg[i] - PC6avg[j])^2) + ((PC7avg[i] - PC7avg[j])^2) + ((PC8avg[i] - PC8avg[j])^2)) printf("%6.3f\t", distance[i, j]) >> "DistMatrix" } printf("\n") >> "DistMatrix" } printf("\n") >> "DistMatrix" } } # This improved random number generator was suggested by Bob. The reference is Weisstein, Eric W "Cliff Random Number Generator." From Math-World-A Wolfram Web Resource. http://mathworld.wolfram.com/CliffRandomNumberGenerator.html function cliff_rand() { _cliff_seed = (100 * log(_cliff_seed)) % 1 if (_cliff_seed < 0) _cliff_seed = -1 * _cliff_seed return _cliff_seed }