/******************************************************************************
 * Program: readCARToutputFromPremodelingCmd.sas
 *
 * Read in CART text file output run from premodeling.cmd (generated by macro
 * writePremodelingCARTcmd.sas), pull out info about each predictor and the
 * various control parameters such as splitting rule, priors, costs,
 * minchild, atom, misclassification rate, sensitivity and specificity, and
 * resubstitution relative cost.
 *
 * Author: Nancy Cheng
 * Date:   01/6/2006
 * Mod:    8/11/2006
 *
 * Note on comments: only block comments are used in this file. Statement
 * style comments (asterisk ... semicolon) are not safe here because they
 * must not contain unmatched quotation marks, and inside a macro the macro
 * processor still executes any macro statements and resolves any macro
 * references that appear in them.
 ******************************************************************************/

footnote 'readCARToutputFromPremodelingCmd.sas';

/*----------------------------------------------------------------------------
 * extract1File: extract the information from ONE CART output file.
 *
 * Parameters:
 *   dir     - directory of the file, already ending in a backslash
 *             (see formatDir)
 *   fname   - name of the CART output file inside dir; when a control
 *             parameter is not printed in the file it is parsed from the
 *             name, e.g. run4_mc12_atom30_01mis1_10mis1.dat
 *   rsltDS1 - one-observation dataset that receives the extracted values
 *
 * Side effects: overwrites WORK.CARTOUT and uses (then clears) fileref CART.
 *---------------------------------------------------------------------------*/
%macro extract1File(dir=, fname=, rsltDS1=);
    /* Keep every helper macro variable local so that a call can neither
       pick up nor overwrite same-named variables of the caller. */
    %local noTree userPriors prior0 prior1 priors minChild atom
           misCost01 misCost10 catvar1 yvar xvar xvar_noTree
           splitRule power tNodes rRC
           nClass0 nMisClass0 mCost0 nClass1 nMisClass1 mCost1;

    /* Defaults. A value of 0 for minChild, atom or misCost01 means that the
       value was not found in the output and must be taken from the file
       name instead. */
    %let noTree=0;
    %let userPriors=0;    /* indicator for user-specified priors */
    %let prior0='';
    %let prior1='';
    %let minChild=0;
    %let atom=0;
    %let misCost01=0;

    filename cart "&dir.&fname";

    /* Step 1: read the file line by line and flag the lines that carry the
       section markers we are interested in. */
    data cartout;
        infile cart length=linelen missover;
        input line $varying200. linelen @1 @;
        lineNo=_n_;
        discreteExist=index(line, 'Discrete');
        depVarExist=index(line, 'Dependent variable');
        targetExist=index(line, 'Target');
        desStatExist=index(line, 'Descriptive Statistics');
        varImpExist=index(line, 'VARIABLE IMPORTANCE');
        noTreeExist=index(line, 'No tree created');
        atomExist=index(line, 'Minimum size below which node');
        minCExist=index(line, 'Minimum size for a child node');
        misCExist=index(line, 'Cost if classified as');
        splitExist=index(line, 'Construction Rule');
        powerExist=index(line, 'Exponent for center weighting in split criterion');
        priorsExist=index(line, 'Priors') + index(line, 'PRIORS');
        rcExist=index(line,'Relative Cost');
        predExist=index(line, 'MISCLASSIFICATION BY CLASS');
    run;

    /* Step 2: remember the line number of each marker and copy the value
       found at a fixed offset below it into a macro variable. The order of
       the statements matters: when several sections set the same macro
       variable, the last one wins. */
    data _null_;
        set cartout;
        retain discLine targetLine desStatLine varImpLine noTreeLine priLine
               misCLine predLine rcLine 0;

        /* first discrete variable */
        if discreteExist>0 then discLine=lineNo;
        if discLine ne 0 and _n_=(discLine+3) then do;
            call symput('catvar1', scan(line,1, ''));
        end;

        /* dependent (target) variable */
        if depVarExist>0 then do;
            call symput('yvar', trim(scan(line,2,':')));
        end;
        if targetExist>0 then targetLine=lineNo;
        if targetLine ne 0 and _n_=(targetLine+3) then do;
            call symput('yvar', scan(line,2, ':'));
        end;

        /* predictor, from the descriptive statistics table */
        if desStatExist>0 then desStatLine=lineNo;
        if desStatLine ne 0 and _n_=(desStatLine+6) then do;
            call symput('xvar', scan(line,1, ''));
        end;

        /* no tree created */
        if noTreeExist>0 then do;
            noTreeLine=lineNo;
            call symput('noTree', '1');
        end;
        if noTreeLine ne 0 and _n_=(noTreeLine+14) then do;
            call symput('xvar_noTree', scan(line,1, ''));
        end;

        /* predictor, from the variable importance table */
        if varImpExist>0 then varImpLine=lineNo;
        if varImpLine ne 0 and _n_=(varImpLine+6) then do;
            call symput('xvar', scan(line,1, ''));
        end;

        /* construction (splitting) rule */
        if splitExist>0 then do;
            call symput('splitRule',scan(line,3,''));
        end;

        /* power used to tune CART away from end-cut splits */
        if powerExist>0 then do;
            call symput('power',input(scan(line,2,'='),6.4));
        end;

        /* priors */
        if priorsExist>0 then do;
            if index(line, 'SET')>0 then do;
                call symput('priors', 'EQUAL');
            end;
            if index(line, 'User')>0 then do;
                priLine=lineNo;
                call symput('userPriors', '1');
            end;
            if index(line, 'Mix')>0 or index(line, 'Data')>0
               or index(line, 'Learn')>0 or index(line, 'Test')>0 then do;
                call symput('priors',scan(line,1,''));
            end;
        end;
        if priLine ne 0 and _n_=(priLine+5) then do;
            call symput('prior0',trim(scan(line,2, '')));
        end;
        if priLine ne 0 and _n_=(priLine+6) then do;
            call symput('prior1',scan(line,2, ''));
        end;

        /* minchild */
        if minCExist>0 then do;
            call symput('minChild', input(scan(line,8,''),3.0));
        end;

        /* atom */
        if atomExist>0 then do;
            call symput('atom', input(scan(line,2,'='),4.0));
        end;

        /* misclassification costs */
        if misCExist>0 then misCLine=lineNo;
        if misCLine ne 0 and _n_=(misCLine+3) then do;
            call symput('misCost01',input(scan(line,4, ''), 5.4));
        end;
        if misCLine ne 0 and _n_=(misCLine+4) then do;
            call symput('misCost10',input(scan(line,3, ''), 5.4));
        end;

        /* terminal nodes and resubstitution relative cost: the data are two
           lines below the line containing the text Relative Cost */
        if rcExist>0 then rcLine=lineNo;
        if rcLine ne 0 and _n_=(rcLine+2) then do;
            call symput('tNodes', scan(line,2, ''));
            call symput('rRC', input(scan(line,3, ''), 6.4));
        end;

        /* misclassification by class */
        if predExist>0 then predLine=lineNo;
        if predLine ne 0 and _n_=(predLine+7) then do;
            call symput('nClass0',input(scan(line,2, ''), 10.2));
            call symput('nMisClass0',input(scan(line,4, ''), 10.2));
            call symput('mCost0',input(scan(line,6, ''), 6.4));
        end;
        if predLine ne 0 and _n_=(predLine+9) then do;
            call symput('nClass1', input(scan(line,2, ''), 10.2));
            call symput('nMisClass1',input(scan(line,4, ''), 10.2));
            call symput('mCost1',input(scan(line,6, ''), 6.4));
        end;
    run;

    /* User priors are reported as SPECIFY 0=p0, 1=p1 */
    %if &userPriors ne 0 %then %do;
        %let priors=%str(SPECIFY 0=&prior0, 1=&prior1);
    %end;

    /* Step 3: save the info held in the macro variables to a dataset.
       Numeric variables are stored with the full 8 bytes: shortened numeric
       lengths are only exact for small integers and would truncate the
       fractional costs and rates as well as large class counts. */
    data &rsltDS1;
        length run 8 yvar xvar $20 tNodes $16 splitRule $12 power 8
               priors $32
               misCost01 misCost10 minChild atom
               nclass0 nMisclass0 nclass1 nMisclass1
               mCost0 mCost1 sensitivity specificity rRC errorRate 8
               dsn $68;

        /* file name example: run4_mc12_atom30_01mis1_10mis1.dat */
        p1=index("&fname", 'mc');
        p2=index("&fname", 'atom');
        p3=index("&fname", '01mis');
        p4=index("&fname", '10mis');
        p5=index("&fname", '.');

        /* all the output files start with run followed by the run number;
           the guard keeps substr away from names without that pattern */
        if p1 > 5 then do;
            run=input(substr("&fname", 4, p1-5), ?? best12.);
        end;

        yvar=symget('yvar');
        if &noTree eq 0 then do;
            xvar=symget('xvar');
            tNodes=symget('tNodes');
        end;
        else do;
            xvar=symget('xvar_noTree');
            tNodes='No tree created';
        end;

        /* catvar1 is either equal to yvar or is one of the discrete
           predictors; for a discrete predictor with no tree created this is
           the only place xvar can be taken from */
        catvar1=symget('catvar1');
        if compress(catvar1) ne compress(yvar) then do;
            if compress(yvar) ne '' then xvar=catvar1;
            else yvar=catvar1;
        end;

        /* minchild: taken from the file name when not in the output file;
           length p2-p1-3 = (p2-2)-(p1+2)+1 */
        if &minChild eq 0 then do;
            if p1 > 0 and p2 > p1+3 then
                minChild=input(substr("&fname", p1+2, p2-p1-3), ?? best12.);
        end;
        else minChild=input(symget('minChild'), ?? best12.);

        /* atom: length p3-p2-5 = (p3-2)-(p2+4)+1 */
        if &atom eq 0 then do;
            if p2 > 0 and p3 > p2+5 then
                atom=input(substr("&fname", p2+4, p3-p2-5), ?? best12.);
        end;
        else atom=input(symget('atom'), ?? best12.);

        priors=symget('priors');

        /* misclassification costs:
           length p4-p3-6 = (p4-2)-(p3+5)+1
           length p5-p4-5 = (p5-1)-(p4+5)+1 */
        if &misCost01 eq 0 then do;
            if p3 > 0 and p4 > p3+6 then
                misCost01=input(substr("&fname", p3+5, p4-p3-6), ?? best12.);
            if p4 > 0 and p5 > p4+5 then
                misCost10=input(substr("&fname", p4+5, p5-p4-5), ?? best12.);
        end;
        else do;
            misCost01=input(symget('misCost01'), ?? best12.);
            misCost10=input(symget('misCost10'), ?? best12.);
        end;

        splitRule=symget('splitRule');
        power=input(symget('power'), ?? best12.);
        rRC=input(symget('rRC'), ?? best12.);
        nClass0=input(symget('nClass0'), ?? best12.);
        nClass1=input(symget('nClass1'), ?? best12.);
        nMisClass0=input(symget('nMisClass0'), ?? best12.);
        nMisClass1=input(symget('nMisClass1'), ?? best12.);
        mCost0=input(symget('mCost0'), ?? best12.);
        mCost1=input(symget('mCost1'), ?? best12.);

        sensitivity=1-mCost0;
        specificity=1-mCost1;
        errorRate=(mCost0 + mCost1)/2;
        dsn=trim("&dir")||trim("&fname");

        /* label misCost01='Misclassification cost if class 0 is classified as 1'; */
        format mCost0 mCost1 sensitivity specificity rRC 6.4;
        drop p1-p5;
    run;

    /* release the file reference so it does not linger after the call */
    filename cart clear;
%mend extract1File;

/*----------------------------------------------------------------------------
 * formatDir: normalise a directory (file folder) argument.
 *   - removes the pair of single quotes if the value is enclosed in them
 *   - adds a backslash to the end if there is not one already
 * The normalised value is returned as the text generated by the macro;
 * an empty argument returns an empty value.
 *---------------------------------------------------------------------------*/
%macro formatDir(dir);
    %local len;
    %let len=%length(&dir);
    %if &len > 2 %then %do;
        %if %qsubstr(&dir, 1, 1) eq %str(%') %then %do;
            %let len=%eval(&len-2);
            %let dir=%qsubstr(&dir, 2, &len);
        %end;
    %end;
    %if &len > 0 %then %do;
        %if %qsubstr(&dir, &len, 1) ne %str(\) %then %str(&dir)%str(\);
        %else %str(&dir);
    %end;
%mend formatDir;

/*----------------------------------------------------------------------------
 * readCARToutputFromPremodelingCmd: extract info from all CART output files
 * in a directory and save it into rsltDS (one observation per file).
 *
 * Parameters:
 *   dir    - file folder holding the output from the CART modeling runs
 *   rsltDS - dataset to save the info; an existing dataset of that name is
 *            deleted first
 *---------------------------------------------------------------------------*/
%macro readCARToutputFromPremodelingCmd(dir=, rsltDS=);
    %local rc fileref dirId memCount i fname;

    proc datasets;
        delete &rsltDS;
    run;
    quit;

    %let dir=%formatDir(&dir);
    %let dirId=0;
    /* a null value makes filename() generate a fileref and store its name
       in the macro variable fileref */
    %let fileref=;

    %if %length(&dir) = 0 %then %do;
        %put WARNING: readCARToutputFromPremodelingCmd was called without a directory.;
    %end;
    %else %do;
        /* assign a file reference to the physical directory; a return code
           of 0 means the operation was successful */
        %let rc=%sysfunc(filename(fileref, &dir));
        %if &rc ne 0 %then %do;
            %put WARNING: readCARToutputFromPremodelingCmd could not assign a fileref to &dir;
        %end;
        %else %do;
            /* open the directory; a directory id greater than 0 is returned
               if the operation is successful */
            %let dirId=%sysfunc(dopen(&fileref));
            %if &dirId > 0 %then %do;
                /* number of members in the directory */
                %let memCount=%sysfunc(dnum(&dirId));
                %do i=1 %to &memCount;
                    /* get the name of each file and extract its info */
                    %let fname=%sysfunc(dread(&dirId, &i));
                    %extract1File(dir=&dir, fname=&fname, rsltDS1=rsltDS1);

                    proc datasets;
                        append base=&rsltDS data=rsltDS1;
                    run;
                    quit;

                    %put fname=&fname;
                %end;
                /* close the directory; only a successfully opened one */
                %let rc=%sysfunc(dclose(&dirId));
            %end;
            %else %do;
                %put WARNING: readCARToutputFromPremodelingCmd could not open directory &dir;
            %end;
            /* deassign the directory fileref */
            %let rc=%sysfunc(filename(fileref));
        %end;
    %end;
%mend readCARToutputFromPremodelingCmd;

/******* test *******/
/* extract info from premodeling */
%let dir=%str('S:\testMacro\Premodeling\output');
%readCARToutputFromPremodelingCmd(dir=&dir, rsltDS=PremodelingReport);

%let dir=%str('S:\testMacro\Premodeling\output3');
%readCARToutputFromPremodelingCmd(dir=&dir, rsltDS=PremodelingReport2);