********************************************************************************************* * Program: writeModelingCARTcmd.sas; * * Write a batch cmd file for CART to run models with predictors selected from premodeling * * with varying control parameters such as splitting rules, priors, cost, serule, minChild * * and atom (make sure minChild <= atom/2). * * reference code: see writemarscmd.sas by Stuart; * * Author: Nancy Cheng * * Date: 01/12/2006 * *********************************************************************************************; * A sample of CART cmd file is as below: LOPTIONS MEANS = NO, PLOTS = NO, TIMING = NO, NOPRINT=NO FORMAT = 4/UNDERFLOW LIMIT MINCHILD = 15, ATOM = 30 USE "S:\FV\New Dataset\fv_dc274.sas7bdat" MODEL DFS0 KEEP AGEMOS, ANFAMINC, DDSP5000, ETHGP2, HDSTRT01_, INSCAT CATEGORY DFS0 INSCAT Misclassify Cost = 2 Classify 0 as 1 Misclassify Cost = 1 Classify 1 as 0 OUTPUT 'S:\FV\CART\OUTPUT\OUT1_mc15.DAT' METHOD GINI POWER = 0.0000 BUILD MODEL DFS0 KEEP AGEMOS, ANFAMINC, DDSP5000, ETHGP2, HDSTRT01_, INSCAT PRIORS MIX CATEGORY DFS0 INSCAT OUTPUT 'S:\FV\CART\OUTPUT\OUT2_mc15.DAT' BOPTIONS SERULE = 0.5 METHOD GINI POWER = 0.0000 BUILD *; footnote 'writeModelingCARTcmd.sas'; *options symbolgen mprint; * macro to generate CART cmd file ; * Parameters: * firstCall - specifies if this macro is called for the first time in a SAS session, 1=yes, 0=no * cmdfile - external file used to save the commands, including the whole path * usefile - specifies the input data file * yvar - response variable; * xlist - a list of predictors (can be only one), separated by comma or space * catVars - categorical variables * minChild - minimum # of obs allowed in a child node, default=1. 
* atom - minimum size below which a node will not be split, default=10, note minChild <= atom/2, otherwise CART will set minChild=atom/2;
* priors - Prior Class Probabilities (EQUAL, DATA, LEARN, TEST, MIX or SPECIFY 0=, 1= ),
           EQUAL is default and best chance of good results
* misCost01 - misclassification cost if class 0 is misclassified as 1
* misCost10 - misclassification cost if class 1 is misclassified as 0
* seRule - the # of standard errors to be used to select the optimal tree
           (the smallest tree within the # std. err. of the min. error tree is selected)
* splitRule - splitting rules, GINI is default.
* power - used with splitting rules to tune CART away from end-cut split
* outDir - absolute directory for output file
* weightVar - weight variable;

/*
 * writeModelingCARTcmd
 *
 * Appends one CART model specification to the command file, in this order:
 *   [first call only: LOPTIONS, FORMAT, USE]
 *   LIMIT (only for the limits that were given), MODEL, KEEP, PRIORS (if given),
 *   CATEGORY, Misclassify Cost (only for the costs that were given), OUTPUT,
 *   BOPTIONS SERULE, METHOD, WEIGHT (if given), BUILD.
 *
 * State shared between calls:
 *   - fileref CARTCODE is assigned only when firstCall=1.
 *   - global macro variables runNo and memCount are created when firstCall=1.
 *     runNo is seeded from the member count of outDir plus one, then
 *     incremented after every call; it is part of the OUTPUT file name.
 *   A call with firstCall other than 1 before any firstCall=1 call is
 *   rejected with an ERROR in the log and writes nothing.
 *
 * Defaults (minChild=1, atom=10, misCost01=1, misCost10=1) that are filled in
 * for blank parameters are used only in the OUTPUT file name; no LIMIT or
 * Misclassify line is written for a blank parameter.
 *
 * NOTE: every step opens the file with MOD, including the first call, so an
 * existing cmd file is appended to rather than replaced.
 */
%macro writeModelingCARTcmd (firstCall=, cmdfile=, usefile=, yvar=, xlist=, catVars=, minChild=, atom=, priors=,
                             misCost01=, misCost10=, seRule=, splitRule=, power=, outDir=, weightVar=);
    /* Scratch variables are kept local so that same-named variables of the caller are not overwritten. */
    %local l3 dir;

    %if &firstCall=1 %then %do;
        %global runNo memCount; /* memCount records the number of files in outDir directory */
        %let runNo=0;           /* run no.; 0 means: not yet seeded from the directory */
        /* Wrap the path in single quotes unless the caller already did. */
        %if %qsubstr(&cmdfile, 1, 1) ne %str(%') %then %let cmdfile=%str(%')%str(&cmdfile)%str(%');
        filename cartcode &cmdfile;
        data _null_;
            file cartcode mod;
            line1='LOPTIONS MEANS = YES, PLOTS = NO, TIMING = NO, NOPRINT';
            put line1;
            line2='FORMAT = 4/UNDERFLOW';
            put line2;
            %if %qsubstr(&usefile, 1, 1) ne %str(%') %then %let usefile=%str(%')%str(&usefile)%str(%');
            /* RESOLVE at run time keeps the quoted path out of the DATA step source text. */
            %let l3=%str(USE &usefile);
            line3=resolve('&l3');
            put line3;
        run;
    %end;
    %else %if not %symexist(runNo) %then %do;
        /* Without a prior firstCall=1 the fileref CARTCODE and runNo do not exist. */
        %put ERROR: writeModelingCARTcmd needs a call with firstCall=1 before any other call in this session.;
        %return;
    %end;

    data _null_;
        file cartcode mod;
        /* if both minChild and atom are not specified, then use the default values for both */
        %if &minChild=%str( ) and &atom = %str( ) %then %do;
            %let minChild=1;
            %let atom=10;
            line1='';
        %end;
        /* if only minChild is not specified, then use the default value for minChild */
        %else %if &minChild=%str( ) and &atom ne %str( ) %then %do;
            line1='LIMIT ATOM = '||trim("&atom");
            %let minChild=1;
        %end;
        /* if only atom is not specified, then use the default value for atom */
        %else %if &minChild ne %str( ) and &atom = %str( ) %then %do;
            line1='LIMIT MINCHILD = '||trim("&minChild");
            %let atom=10;
        %end;
        /* both minChild and atom are specified */
        %else %do;
            line1='LIMIT MINCHILD = '||trim("&minChild")||', ATOM = '||trim("&atom");
        %end;
        if line1 ne '' then do;
            put line1;
        end;
        line2='MODEL '||trim("&yvar");
        put line2;
    run;

    /* write keep statement with predictors */
    %writeLongVarList(fileref=cartcode, vars=&xlist, keep=1);

    %if &priors ne %str( ) %then %do;
        data _null_;
            file cartcode mod;
            line='PRIORS '||trim("&priors");
            put line;
        run;
    %end;

    /* write category statement with a list of variables */
    %writeLongVarList(fileref=cartcode, vars=&catVars, keep=0);

    /* get # of output files in the output directory (CART runs) */
    %fileNumInDir(dir=&outDir);
    %if &runNo=0 %then %let runNo=%eval(&memCount+1);

    data _null_;
        file cartcode mod;
        %if &misCost01=%str( ) and &misCost10 = %str( ) %then %do;
            %let misCost01=1;
            %let misCost10=1;
            line1='';
            line2='';
        %end;
        %else %if &misCost01=%str( ) and &misCost10 ne %str( ) %then %do;
            %let misCost01=1;
            line2='Misclassify Cost = '||trim("&misCost10")||' Classify 1 as 0';
            put line2;
        %end;
        %else %if &misCost01 ne %str( ) and &misCost10 = %str( ) %then %do;
            %let misCost10=1;
            line1='Misclassify Cost = '||trim("&misCost01")||' Classify 0 as 1';
            put line1;
        %end;
        %else %do;
            line1='Misclassify Cost = '||trim("&misCost01")||' Classify 0 as 1';
            put line1;
            line2='Misclassify Cost = '||trim("&misCost10")||' Classify 1 as 0';
            put line2;
        %end;
        /* Output file name encodes the run number and the control parameters of this run. */
        %let dir=%formatDir(&outDir);
        %let l3=%str(OUTPUT %'&dir)%str(run&runNo)%str(_mc&minChild)%str(_atom&atom)%str(_01mis&misCost01)%str(_10mis&misCost10)%str(.dat%');
        line3=resolve('&l3');
        put line3;
        %if &seRule = %str( ) %then %let seRule=0;
        line4='BOPTIONS SERULE = '||trim("&seRule");
        put line4;
        %if &splitRule=%str( ) %then %let splitRule=%str(GINI);
        %if &power = %str( ) %then %let power=0.0;
        line5='METHOD '||trim("&splitRule")||' POWER='||trim("&power");
        put line5;
    run;

    %if &weightVar ne %str() %then %do;
        data _null_;
            file cartcode mod;
            line='WEIGHT '||trim("&weightVar");
            put line;
        run;
    %end;

    data _null_;
        file cartcode mod;
        line='BUILD';
        put line;
    run;

    %let runNo=%eval(&runNo+1);
%mend;

/*
 * fileNumInDir
 *
 * Stores in macro variable memCount the number of members that DNUM reports
 * for directory dir. memCount is assigned with a plain %LET, so it updates
 * an existing memCount of the caller or the global scope (as created by
 * writeModelingCARTcmd with firstCall=1).
 *
 *   dir - directory path, with or without surrounding quotes.
 *
 * If dir is blank or cannot be opened, memCount is set to 0 and a WARNING is
 * written to the log. The directory handle is closed only when it was
 * opened, and the temporary fileref is always released.
 */
%macro fileNumInDir(dir=);
    %local rc fileref dirId path;
    %let dirId=0;
    %if %length(&dir) > 0 %then %do;
        /* formatDir accepts a quoted directory, so strip quotes here as well. */
        %let path=%qsysfunc(dequote(&dir));
        /* fileref is blank, so FILENAME generates a name and stores it in the macro variable fileref. */
        %let rc=%sysfunc(filename(fileref, &path));
        /* open the dir. and return a dir. id (>0) if the operation is successful */
        %if &rc = 0 %then %let dirId=%sysfunc(dopen(&fileref));
    %end;
    %if &dirId > 0 %then %do;
        %let memCount=%sysfunc(dnum(&dirId));  /* number of files in the directory */
        %let rc=%sysfunc(dclose(&dirId));      /* close the directory */
    %end;
    %else %do;
        %let memCount=0;
        %put WARNING: fileNumInDir could not open directory &dir - memCount set to 0.;
    %end;
    /* Release the generated fileref so repeated calls do not accumulate filerefs. */
    %if %length(&fileref) > 0 %then %let rc=%sysfunc(filename(fileref));
%mend;

/*
 * formatDir
 *
 * Function-style macro that normalises a directory (file folder) path:
 *   - removes the surrounding pair of single quotes if dir starts with one,
 *   - adds \ to the end of dir if it does not already end with one.
 * The result is returned as generated text. An empty argument, or one that
 * is empty after the quotes are removed, returns nothing.
 */
%macro formatDir(dir);
    %local len;
    %let len=%length(&dir);
    %if &len > 0 %then %do;
        %if %qsubstr(&dir, 1, 1) eq %str(%') %then %do;
            %let len=%eval(&len-2);
            %if &len > 0 %then %let dir=%qsubstr(&dir, 2, &len);
            %else %let dir=;
        %end;
    %end;
    %if &len > 0 %then %do;
        %if %qsubstr(&dir, &len, 1) ne %str(\) %then %str(&dir)%str(\);
        %else %str(&dir);
    %end;
%mend;

/*
 * writeLongVarList
 *
 * Appends a KEEP (keep=1) or CATEGORY (any other keep value) statement with
 * a long list of variables to the file behind fileref. The list is broken
 * into lines of at most maxLen characters of list text; each break is placed
 * right after a comma or blank so that a variable name is never written
 * across two lines (CART would not recognise a split name). The first line
 * starts with the keyword, continuation lines start with one blank.
 *
 *   fileref - fileref of the command file (opened with MOD)
 *   vars    - variable names separated by comma or space
 *   keep    - 1 writes KEEP, anything else writes CATEGORY
 *   maxLen  - maximum characters of list text per line, default 85
 *
 * An empty vars writes nothing, so no bare KEEP/CATEGORY keyword is emitted.
 */
%macro writeLongVarList(fileref=, vars=, keep=, maxLen=85);
    %local varsLen len i list lastChar start kw;
    %let varsLen=%length(&vars);
    %put varsLen=&varsLen;
    %if &varsLen > 0 %then %do;
        %if &keep=1 %then %let kw=KEEP;
        %else %let kw=CATEGORY;
        %let i=1;
        data _null_;
            file &fileref mod;
            %do %while (&varsLen > &maxLen);
                /* Back up from column maxLen to the nearest comma or blank. */
                %let len=&maxLen;
                %let lastChar=%qsubstr(&vars, &len, 1);
                %do %while (&len > 1 and &lastChar ne %str(,) and &lastChar ne %str( ));
                    %let len=%eval(&len-1);
                    %let lastChar=%qsubstr(&vars, &len, 1);
                %end;
                /* No separator at all: break at maxLen rather than run off the front of the string. */
                %if &lastChar ne %str(,) and &lastChar ne %str( ) %then %do;
                    %put WARNING: writeLongVarList found no comma or blank in the first &maxLen characters - splitting inside a name.;
                    %let len=&maxLen;
                %end;
                %let list=%qsubstr(&vars, 1, &len);
                %if &i=1 %then %do;
                    line&i="&kw "||trim("&list");
                %end;
                %else %do;
                    line&i=' '||trim("&list");
                %end;
                put line&i;
                /* Continue with whatever follows the break position. */
                %let start=%eval(&len+1);
                %let vars=%qsubstr(&vars, &start);
                %let varsLen=%length(&vars);
                %let i=%eval(&i+1);
            %end;
            /* Remainder (or the whole list when it fits on one line). */
            %let list=&vars;
            %if &i=1 %then %do;
                line&i="&kw "||trim("&list");
            %end;
            %else %do;
                line&i=' '||trim("&list");
            %end;
            put line&i;
        run;
    %end;
%mend;

************test****************;
%let cmdfile=%str('S:\testMacro\modeling.cmd');
%let outDir=%str(S:\testMacro\modeling\output);
%let usefile=%str('S:\testMacro\fv_ctr115.sas7bdat');
%let xlist=%str(AGEMOS, ANFAMINC, COFEED01, DCTOTAL3, DDSP5000, ETHGP2, INSCAT, L10LAC1, L10MUT1, MOMEDU3);
%let catVars=%str(DFS0, INSCAT);
%writeModelingCARTcmd (firstCall=1, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.0, outDir=&outDir, weightVar=);
%writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.5, outDir=&outDir, weightVar=);
%writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=1, outDir=&outDir, weightVar=);
%writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=TWOING, power=0.0, outDir=&outDir, weightVar=);
%writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=TWOING, power=0.5, outDir=&outDir, weightVar=);
%writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars,
minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=TWOING, power=1, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.0, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.5, splitRule=GINI, power=0.5, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=1.0, splitRule=GINI, power=1, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=%STR(SPECIFY 0=.4, 1=.6), misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.0, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=%STR(SPECIFY 0=.4, 1=.6), misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.5, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=%STR(SPECIFY 0=.4, 1=.6), misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=1, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=10, atom=25, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.0, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, 
catVars=&catVars, minChild=15, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.0, outDir=&outDir, weightVar=); %writeModelingCARTcmd (firstCall=0, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=10, atom=20, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.0, outDir=&outDir, weightVar=); %let cmdfile=%str('S:\testMacro\modeling2.cmd'); %let outDir=%str(S:\testMacro\modeling\output); %let usefile=%str(S:\FV\New Dataset\fv_ctr97.sas7bdat); %let xlist=%str(AGEMOS, ANFAMINC, ANTIBI1Y_, ANTIBPRG_, BIRTHWT, BOTHPARS, BRELSE01,BRSELF01, BRSTFD01, BRTHORD, CHELSEN, CHSELFN, COFEED01_, DAYCAR01_, DAYOFF01_, DCTOTAL3, DDSP5000, DHCRT_PCT, DKCRT_PCT, ECCBOT01_, ETHGP2, EVRBOT01, FLOPASTE_, FLOSUPPL_, FPPM1, FQPASTE, FRSVIS01, GH1, GH2, HARMSW01, HDSTRT01_, INSCAT, IRONDEF, L10LAC1, L10MUT1, MALE01, MOMAGE_, MOMEDU3, MOMOCC_, MOMPBEXP_, MOMWKHR, NCHILDRN_, NDAYSLPB_, NMEALDAY, NSNKAFT, NSNKDAY, NSNKEVE, NSNKMORN, NTEETH, PDECAY01_, PDOPAIN, PTRTDEC_, SFGH01, STILLB01, SWTBDNOW, SWTBED12, USALAD, WHYCHANC, WHYFAR, WHYNONEE, WHYPAIN, WHYYOUNG, XASNPOPCY, XBLKPOPCY, XHISPOPCY, XHL25KCY, XP0TO5CY); %let catVars=%str(DFS0 ETHGP2 INSCAT MOMOCC_ PDOPAIN); %writeModelingCARTcmd (firstCall=1, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.0, outDir=&outDir, weightVar=); %let cmdfile=%str('S:\testMacro\modeling3.cmd'); %let outDir=%str(S:\testMacro\modeling\output3); %let usefile=%str('S:\testMacro\fv_ctr115.sas7bdat'); %let xlist=%str(AGEMOS, ANFAMINC, ANTIBI1Y$, BIRTHWT, BRSTFD01, BRTHORD, CHSELFN, COFEED01_, DAYOFF01, DCTOTAL3, DDSP5000, DHCRT_PCT, DKCRT_PCT, ETHGP2, EVRBOT01, FLOPASTE_, FPPM1, FQPASTE, FRSVIS01, GH1, GH2, INSCAT, IRONDEF, L10LAC1, L10MUT1, MOMAGE_, MOMEDU3, MOMEMP, MOMOCC_, MOMPBEXP, MOMWKHR, NCHILDRN_, NMEALDAY, NSNKAFT, NSNKDAY, 
NSNKEVE, NTEETH, PDECAY01, PDOPAIN, PLAST_VI, PTRTDEC_, STILLB01, SWTBDNOW, SWTBED12, USALAD, WHYCHANC, WHYYOUNG, XASNPOPCY, XBLKPOPCY, XHISPOPCY, XHL25KCY, XP0TO5CY); *52 vars; %let catVars=%str(DFS0 DAYOFF01 ETHGP2 INSCAT MOMEMP MOMOCC_ MOMPBEXP PDECAY01 PDOPAIN); %writeModelingCARTcmd (firstCall=1, cmdfile=&cmdfile, usefile=&usefile, yvar=DFS0, xlist=&xlist, catVars=&catVars, minChild=12, atom=30, priors=EQUAL, misCost01=1, misCost10=1, serule=0.0, splitRule=GINI, power=0.0, outDir=&outDir, weightVar=);