/******************************************************************************************************
 Program: readCARToutputFromModelingCmd.sas

 Read in each CART output text file from a directory output from premodeling.cmd generated by macro
 'writeModelingCARTcmd.sas', pull out info about each run such as splitting rule, prior, cost,
 serule, minchild, atom, and info such as CV relative cost, misclassification rate, sensitivity and
 specificity for the optimal tree for each run. Append all this info into one dataset. Extract tree
 sequence for each run and append tree sequence for all the runs into one dataset. Extract primary
 splitters in the optimal tree for each run, and append this info for all the runs into one dataset.
 Extract variable importance info corresponding to the optimal tree for each run, and append this
 info for all the runs into one dataset.

 TODO: add comment about macro postProcess

 Author: Nancy Cheng
 Date:   01/13/2006
******************************************************************************************************/
footnote 'readCARToutputFromModelingCmd.sas';
/* options symbolgen mprint; */

/*----------------------------------------------------------------------------------------------------
 Macro extract1File: extract info from one CART output text file (one run).

 Parameters:
   dir          - file holder (directory, including the trailing separator) of the CART output files
   fname        - file name of a CART output text file for one run;
                  expected pattern: run4_mc12_atom30_01mis1_10mis1.dat
   rsltDS1      - dataset to save the control parameters and results about the optimal tree
   treeSeqDS1   - dataset to save the tree sequence for the run
   primSplitDS1 - dataset to save primary splitters corresponding to the optimal tree
   varImpDS1    - dataset to save variable importance scores corresponding to the optimal tree

 Side effects: creates WORK.CARTOUT and assigns (then clears) the fileref CART.
----------------------------------------------------------------------------------------------------*/
%macro extract1File(dir=, fname=, rsltDS1=, treeSeqDS1=, primSplitDS1=, varImpDS1=);

  /* Everything handed from one step to the next through CALL SYMPUT/SYMGET is declared local,
     so a run can neither overwrite nor pick up same-named macro variables of a caller. */
  %local run yvar tNodes cvRC p1 splitRule power seRule priors userPriors prior0 prior1
         minChild atom misCost01 misCost10 nClass0 nMisClass0 nClass1 nMisClass1;

  filename cart "&dir.&fname";

  /* Step 1: read the raw text and flag the lines that anchor the later extraction. */
  data cartout;
    infile cart length=linelen missover;
    input line $varying200. linelen @1 @;
    /* input line $varying200. linelen @1 ;  -- this works too */
    lineNo         = _n_;
    depVarExist    = index(line, 'Dependent variable');
    initialMCExist = index(line, 'Initial misclassification cost');
    optimalExist   = index(line, '**');    /* line about the optimal tree */
    moreStarsExist = index(line, '***');
    splitExist     = index(line, 'A case goes');    /* line having a primary splitter */
    varImpdExist   = index(line, 'VARIABLE IMPORTANCE');
    mcExist        = index(line, ' Misclassification Costs');
    atomExist      = index(line, 'Minimum size below which node');    /* line having atom */
    minCExist      = index(line, 'Minimum size for a child node');
    misCExist      = index(line, 'Cost if classified as');
    splitRuleExist = index(line, 'Construction Rule');
    powerExist     = index(line, 'Exponent for center weighting in split criterion');
    seRuleExist    = index(line, 'Tree Selection');
    priorsExist    = index(line, 'Priors') + index(line, 'PRIORS');
    predExist      = index(line, 'MISCLASSIFICATION BY CLASS');
  run;

  /* Defaults for values that may be absent from the output file.
     A value of 0 means "not found in the file, take it from the file name instead". */
  %let userPriors=0;    /* indicator for user prior use */
  %let prior0='';
  %let prior1='';
  %let minChild=0;
  %let atom=0;
  %let misCost01=0;     /* also governs misCost10 */

  /* Step 2: walk the flagged lines, write the tree sequence, primary splitters and variable
     importance tables, and pass the single-valued run settings on through macro variables.
     NOTE(review): several INPUT() calls below use w.d informats (6.4, 5.4, 10.2, ...). Those
     divide by 10**d when the text has no decimal point - confirm CART always prints one. */
  data &treeSeqDS1(keep=run tree tNodes cvRC plusOrMinus se rRC cp)
       &primSplitDS1(rename=(optTreeCVRC=cvRC) keep=run optTreeCVRC splitter split)
       &varImpDS1(rename=(optTreeCVRC=cvRC) keep=run yvar optTreeCVRC xvar score);
    set cartout;

    /* Numeric variables keep full 8-byte storage: a 3-byte numeric holds only about 3-4
       significant digits, which truncates relative costs and importance scores. */
    length run tree tNodes $4 cvRC 8 plusOrMinus $3 se $12 rRC 8
           splitter yvar xvar $20 split $50 score 8;
    retain depVarLine varImpLine tsTable viTable priLine misCLine predLine optTreeCVRC 0 yvar;

    /* file name example: run4_mc12_atom30_01mis1_10mis1.dat */
    p1  = index("&fname", 'mc');
    run = substr("&fname", 4, p1-5);    /* all the output files start with 'run' */
    if _n_ = 1 then do;                 /* constant per file: publish once */
      call symput('p1', p1);
      call symput('run', run);
    end;

    /* dependent variable */
    if depVarExist > 0 then do;
      yvar       = scan(line, 2, ':');
      depVarLine = lineNo;
      call symput('yvar', yvar);
    end;

    /* tree sequence table: starts 5 lines below the dependent-variable line and ends at the
       'Initial misclassification cost' line */
    if depVarLine ne 0 and lineNo = depVarLine+5 then tsTable = 1;
    if initialMCExist > 0 then tsTable = 0;
    if tsTable = 1 then do;
      tree        = scan(line, 1, ' ');
      tNodes      = scan(line, 2, ' ');
      cvRC        = input(scan(line, 3, ' '), 6.4);
      plusOrMinus = '+/-';
      se          = scan(line, 5, ' ');    /* for numeric se: se=input(scan(line,5,' '), E11.) */
      rRC         = input(scan(line, 6, ' '), 6.4);
      cp          = input(scan(line, 7, ' '), 8.6);
      output &treeSeqDS1;
    end;

    /* optimal tree: the line flagged with exactly two stars */
    if optimalExist > 0 and moreStarsExist = 0 then do;
      call symput('tNodes', scan(line, 2, ' '));    /* number of terminal nodes */
      optTreeCVRC = input(scan(line, 3, ' '), 6.4);
      call symput('cvRC', optTreeCVRC);             /* CVRC of the optimal tree */
    end;

    /* primary splitters */
    if splitExist > 0 then do;
      splitter = scan(line, 6, ' ');
      split    = scan(line, 8, ' ');
      output &primSplitDS1;
    end;

    /* variable importance table: starts 6 lines below its heading and ends at the
       ' Misclassification Costs' line */
    if varImpdExist > 0 then varImpLine = lineNo;
    if varImpLine ne 0 and lineNo = varImpLine+6 then viTable = 1;
    if mcExist > 0 then viTable = 0;
    if viTable = 1 then do;
      xvar  = scan(line, 1, ' ');
      score = input(scan(line, 2, ' '), 8.4);
      output &varImpDS1;
    end;

    /* construction rule */
    if splitRuleExist > 0 then call symput('splitRule', scan(line, 3, ' '));

    /* power used to tune CART away from end-cut splits */
    if powerExist > 0 then call symput('power', input(scan(line, 2, '='), 6.4));

    /* priors */
    if priorsExist > 0 then do;
      if index(line, 'SET') > 0 then call symput('priors', 'EQUAL');
      if index(line, 'User') > 0 then do;
        priLine = lineNo;
        call symput('userPriors', 1);
      end;
      if index(line, 'Mix') > 0 or index(line, 'Data') > 0
         or index(line, 'Learn') > 0 or index(line, 'Test') > 0 then
        call symput('priors', scan(line, 1, ' '));
    end;
    if priLine ne 0 and _n_ = (priLine+5) then call symput('prior0', trim(scan(line, 2, ' ')));
    if priLine ne 0 and _n_ = (priLine+6) then call symput('prior1', scan(line, 2, ' '));

    /* minchild */
    if minCExist > 0 then call symput('minChild', input(scan(line, 8, ' '), 3.0));

    /* atom */
    if atomExist > 0 then call symput('atom', input(scan(line, 2, '='), 4.0));

    /* misclassification costs: 3 and 4 lines below the 'Cost if classified as' line */
    if misCExist > 0 then misCLine = lineNo;
    if misCLine ne 0 and _n_ = (misCLine+3) then
      call symput('misCost01', input(scan(line, 4, ' '), 5.4));
    if misCLine ne 0 and _n_ = (misCLine+4) then
      call symput('misCost10', input(scan(line, 3, ' '), 5.4));

    /* seRule */
    if seRuleExist > 0 then call symput('seRule', input(scan(line, 3, ' '), 6.4));

    /* class sizes and misclassified counts: 9 and 12 lines below the heading */
    if predExist > 0 then predLine = lineNo;
    if predLine ne 0 and _n_ = (predLine+9) then do;
      tmp1 = scan(line, 1, ' ');    /* it has '(' */
      tmp2 = substr(tmp1, 2);       /* the number for nClass0 */
      call symput('nClass0', input(tmp2, 10.2));
      call symput('nMisClass0', input(scan(line, 3, ' '), 10.2));
    end;
    if predLine ne 0 and _n_ = (predLine+12) then do;    /* same layout, other class */
      tmp1 = scan(line, 1, ' ');
      tmp2 = substr(tmp1, 2);
      call symput('nClass1', input(tmp2, 10.2));
      call symput('nMisClass1', input(scan(line, 3, ' '), 10.2));
    end;

    format cvRC rRC 6.4 score 8.4 cp 8.6;
    label tNodes='Terminal Nodes' cvRC='Cross-Validated Relative Cost' plusOrMinus='+/-'
          se='Standard Error' rRC='Resubstitution Relative Cost' cp='Complexity Parameter'
          yvar='Dependent Variable' xvar='Predictor' score='Relative Importance';
  run;

  /* User-specified priors are reported as one combined string. Evaluated here, after the
     step above has run, so userPriors/prior0/prior1 already carry the extracted values. */
  %if &userPriors ne 0 %then %do;
    %let priors=%str(SPECIFY 0=&prior0, 1=&prior1);
  %end;

  /* Step 3: save the info from the above macro variables to a one-record dataset. */
  data &rsltDS1;
    /* 8-byte numerics: counts above 8192 and fractional rates do not survive length 3 */
    length run tNodes $4 yvar $20 splitRule $12 power 8 priors $26
           misCost01 misCost10 seRule minChild atom nClass0 nMisClass0 nClass1 nMisClass1
           mCost0 mCost1 sensitivity specificity cvRC errorRate 8 dsn $68;

    run    = symget('run');
    tNodes = symget('tNodes');
    yvar   = symget('yvar');

    /* file name example: run4_mc12_atom30_01mis1_10mis1.dat */
    p1 = symget('p1');
    p2 = index("&fname", 'atom');
    p3 = index("&fname", '01mis');
    p4 = index("&fname", '10mis');
    p5 = index("&fname", '.');

    /* if minchild is not in the output file then extract it from the file name */
    if &minChild eq 0 then minChild = substr("&fname", p1+2, p2-p1-3);   /* (p2-2)-(p1+2)+1 */
    else minChild = symget('minChild');

    priors = symget('priors');

    if &atom eq 0 then atom = substr("&fname", p2+4, p3-p2-5);           /* (p3-2)-(p2+4)+1 */
    else atom = symget('atom');

    if &misCost01 eq 0 then do;
      misCost01 = substr("&fname", p3+5, p4-p3-6);                       /* (p4-2)-(p3+5)+1 */
      misCost10 = substr("&fname", p4+5, p5-p4-5);                       /* (p5-1)-(p4+5)+1 */
    end;
    else do;
      misCost01 = symget('misCost01');
      misCost10 = symget('misCost10');
    end;

    seRule     = symget('seRule');
    splitRule  = symget('splitRule');
    power      = symget('power');
    nClass0    = symget('nClass0');
    nClass1    = symget('nClass1');
    nMisClass0 = symget('nMisClass0');
    nMisClass1 = symget('nMisClass1');
    cvRC       = symget('cvRC');

    mCost0      = nMisClass0/nClass0;
    mCost1      = nMisClass1/nClass1;
    sensitivity = 1-mCost0;
    specificity = 1-mCost1;
    errorRate   = (mCost0 + mCost1)/2;
    dsn         = trim("&dir")||trim("&fname");

    drop p1-p5;
    format mCost0 mCost1 errorRate sensitivity specificity cvRC 6.4;
    label tNodes='Terminal Nodes' yvar='Dependent Variable' splitRule='Construction Rule'
          cvRC='Cross-Validated Relative Cost';
  run;

  /* remove useless records */
  data &treeSeqDS1;
    set &treeSeqDS1;
    where tNodes ne '';
  run;

  data &varImpDS1;
    set &varImpDS1;
    where score ne .;
  run;

  /* release the file so the fileref does not stay assigned between runs */
  filename cart clear;
%mend extract1File;

* macro to extract info from all CART output files in a directory, and save it;
* Parameters:
* dir = specifies the file holder for output files from CART modeling runs;
* rsltDS= specifies the dataset to save the various control parameter, and result about the optimal tree of each run for all runs;
* treeSeqDS= specifies the dataset to save tree sequences for all runs;
* primSplitDS= specifies the dataset to save primary splitters corresponding to the optimal trees for all runs;
* varImpDS= specifies the dataset to
save variable importance scores corresponding to the optimal trees for all runs;
* aggrVIds= specifies the dataset to save aggregated VI combining all run;
%macro readCARToutputFromModelingCmd(dir=, rsltDS=, treeSeqDS=, primSplitDS=, varImpDS=, aggrVIds=);
  /* Reads every file in &dir with extract1File, appends the per-run results to the four
     output datasets, then aggregates the variable importance scores across runs into &aggrVIds.
     Relies on the macros formatDir, extract1File and getXVarNamesAndNum, and uses the WORK
     datasets rsltDS1, treeSeqDS1, primSplitDS1, varImpDS1, varImpTmp, viTP0, viTP, minVI,
     viOneRef, sumVI and aggrViTP as scratch space. */
  %local rc fileref dirId memCount i fname len refXPos;
  %global xNum xlist;
  %let dirId=0;       /* stays 0 unless the directory is opened successfully */
  %let refXPos=;

  /* start from empty result datasets */
  proc datasets nolist;
    delete &rsltDS &treeSeqDS &primSplitDS &varImpDS &aggrVIds;
  run;
  quit;

  %let dir=%formatDir(&dir);

  /* Assign a fileref to the physical directory; rc = 0 means filename() succeeded.
     fileref is a null local macro variable, so SAS generates the fileref name itself. */
  %let rc=%sysfunc(filename(fileref,&dir));
  /* open the directory; a directory id > 0 means success */
  %if &rc = 0 %then %let dirId=%sysfunc(dopen(&fileref));

  %if &dirId > 0 %then %do;
    %let memCount=%sysfunc(dnum(&dirId));    /* number of files in the directory */
    %do i=1 %to &memCount;
      %let fname=%sysfunc(dread(&dirId,&i));    /* file name of the i-th member */

      /* extract info from the file, then append it to the combined datasets */
      %extract1File(dir=&dir, fname=&fname, rsltDS1=rsltDS1, treeSeqDS1=treeSeqDS1,
                    primSplitDS1=primSplitDS1, varImpDS1=varImpDS1);

      proc datasets nolist;
        append base=&rsltDS      data=rsltDS1;
        append base=&treeSeqDS   data=treeSeqDS1;
        append base=&primSplitDS data=primSplitDS1;
        append base=&varImpDS    data=varImpDS1;
      run;
      quit;
    %end;
    %let rc=%sysfunc(dclose(&dirId));    /* close the directory (only if it was opened) */
  %end;
  %else %put WARNING: readCARToutputFromModelingCmd could not open directory &dir (rc=&rc).;

  /* release the directory fileref */
  %let rc=%sysfunc(filename(fileref));

  /* nothing was extracted: skip the post-processing instead of failing on missing datasets */
  %if %sysfunc(exist(&rsltDS)) = 0 or %sysfunc(exist(&varImpDS)) = 0 %then %do;
    %put WARNING: readCARToutputFromModelingCmd found no CART results in &dir - post-processing skipped.;
    %goto done;
  %end;

  /* postProcess */
  proc sort data=&rsltDS out=&rsltDS;
    by misCost01 misCost10 errorRate;
  run;

  /* exclude the runs with no tree */
  data varImpTmp;
    set &varImpDS;
    if cvRC ne 1;
  run;

  proc sort data=varImpTmp;
    by run;
  run;

  /* Transpose so that each predictor becomes a variable instead of a record. Because of the
     COPY statement the result keeps the original number of observations per run. */
  proc transpose data=varImpTmp out=viTP0 (drop= _LABEL_ _NAME_);
    by run;
    copy cvRC;
    var score;
    id xvar;
  run;

  /* only the first record of each run carries the transposed scores */
  data viTP;
    set viTP0;
    by run;
    if first.run;
  run;

  /* retrieve predictor variable names and their number from viTP */
  %getXVarNamesAndNum(ds=viTP);
  %if &xNum < 1 %then %do;
    %put WARNING: readCARToutputFromModelingCmd found no predictors with importance scores - aggregation skipped.;
    %goto done;
  %end;
  %let len=%length(&xlist);

  /* minimum of the VI score for each predictor across all runs */
  proc summary data=viTP;
    var &xlist;
    output out=minVI(drop=_type_ _freq_) min=&xlist;
  run;

  /* The reference predictor is the first one whose minimum score is > 0 (a reference cannot
     be 0); its position in xlist is stored in refXPos. */
  data _null_;
    set minVI;
    array xvars{&xNum} &xlist;
    do i=1 to &xNum;
      if xvars(i) > 0.0 then do;
        call symput('refXPos', trim(left(put(i, 8.))));
        leave;
      end;
    end;
  run;

  %if %length(&refXPos) = 0 %then %do;
    %put WARNING: readCARToutputFromModelingCmd found no predictor with a minimum score > 0 - aggregation skipped.;
    %goto done;
  %end;

  /* Rescale every run so that the reference predictor scores 100. The reference score is
     captured BEFORE the loop: the loop overwrites xvars(&refXPos) with 100, and dividing by the
     array element itself would leave every later predictor unscaled. */
  data viOneRef;
    set viTP;
    length refX $20 _xlist $&len;    /* _xlist sized to the list so it is not cut at 200 chars */
    array xvars{&xNum} &xlist;
    _xlist    = symget('xlist');
    refX      = scan(_xlist, &refXPos, ' ');
    _refScore = xvars(&refXPos);
    do _i=1 to &xNum;
      xvars(_i) = xvars(_i)/_refScore *100;
    end;
    drop _xlist _refScore _i;
  run;

  /* sum of the rescaled score for each predictor */
  proc summary data=viOneRef;
    var &xlist;
    output out=sumVI(drop=_type_ _freq_) sum=&xlist;
  run;

  /* aggregate VI: rescale so that the highest aggregated score is 100 */
  data aggrViTP;
    set sumVI;
    array xvars{&xNum} &xlist;
    maxX = xvars(1);
    do i=2 to &xNum;
      maxX = max(maxX, xvars(i));    /* highest aggregated score */
    end;
    do i=1 to &xNum;
      xvars(i) = xvars(i)/maxX *100;
    end;
    drop maxX i;
  run;

  /* convert the aggregated transposed VI back to the original one-record-per-predictor form */
  data &aggrVIds(keep=xvar score);
    set aggrViTP;
    length xvar $20 xlist $&len;
    array xvars{&xNum} &xlist;
    xlist = symget('xlist');
    do i=1 to &xNum;
      xvar  = scan(xlist, i);
      score = xvars(i);
      output;
    end;
    format score 8.4;
  run;

  proc sort data=&aggrVIds out=&aggrVIds;
    by descending score;
  run;

  %done:
%mend readCARToutputFromModelingCmd;

/* Macro formatDir: normalizes a directory (file folder) parameter.
   - removes the surrounding pair of single quotes if the value starts with one
   - adds \ to the end of dir if there is not one already
   The result is returned as the macro's generated text, e.g. %let dir=%formatDir(&dir).
   An empty value is returned unchanged. */
%macro formatDir(dir);
  %local len;
  %let len=%length(&dir);
  %if &len > 0 %then %do;
    %if &len > 2 %then %do;
      %if %qsubstr(&dir, 1, 1) eq %str(%') %then %do;
        %let len=%eval(&len-2);
        %let dir=%qsubstr(&dir, 2, &len);
      %end;
    %end;
    %if %qsubstr(&dir, &len, 1) ne %str(\) %then %str(&dir)%str(\);
    %else %str(&dir);
  %end;
%mend formatDir;

/* Macro getXVarNamesAndNum: gets the number and names of predictor variables from a dataset,
   here a transposed variable importance dataset whose first two variables are run and cvRC.
   Parameters:
     ds - dataset to inspect
   Results: xlist (blank-separated variable names from position 3 on) and xNum (their count;
   0 when the dataset cannot be opened or has no such variables). Both are assigned with %let,
   so they land in whichever scope already holds them (global when the caller declared them so). */
%macro getXVarNamesAndNum(ds=);
  %local dsid num i rc;    /* keep the loop index and handles out of the caller's scope */
  %let xlist=;
  %let xNum=0;
  %let dsid=%sysfunc(open(&ds));
  %if &dsid > 0 %then %do;
    %let num=%sysfunc(attrn(&dsid,nvars));
    %do i=3 %to &num;    /* exclude first two vars - run and cvRC */
      %let xlist=&xlist %sysfunc(varname(&dsid,&i));
    %end;
    %let rc=%sysfunc(close(&dsid));
    %if &num > 2 %then %let xNum=%eval(&num-2);
  %end;
  %else %put WARNING: getXVarNamesAndNum could not open dataset &ds..;
  %put xNum=&xNum xlist=&xlist;
%mend getXVarNamesAndNum;

/* extract info from CART modeling output files */
%let dir=%str('S:\testMacro\Modeling\output');
%readCARToutputFromModelingCmd(dir=&dir, rsltDS=ModelingReport, treeSeqDS=treeSeq,
                               primSplitDS=primarySplitter, varImpDS=varImp, aggrVIds=aggrVI);