Perl code to convert SAS code to Stata code. This works if a project provides SAS statements to read in their fixed-width text file data format, as is reasonably common. Wrote this up to convert MEPS data without using SAS, and thought someone out there might find it useful. It fails if the value labels aren't all numeric, but that should be easy enough to fix.
--Ari
Usage:
./SAS_to_Stata.pl < infile.sas > outfile.do
Code:
#!/usr/bin/perl
# Convert the INPUT, LABEL, VALUE and FORMAT statements of a SAS program that
# reads a fixed-width text file into the equivalent Stata do-file commands.
# The SAS program is read from STDIN; the do-file is written to STDOUT.
use strict;
use warnings;

# Parser state: each flag is 1 while we are inside the corresponding
# (possibly multi-line) SAS statement -- the "receptive state" -- and 0
# otherwise.
my $input  = 0;
my $label  = 0;
my $value  = 0;
my $format = 0;

# Translate one line of the SAS program, updating the parser state above.
# Takes the raw line (line ending optional) and returns the Stata text to
# emit for that line, which may be the empty string.
sub convert_line {
    my ($line) = @_;
    my $out = '';

    # Drop the line ending and any trailing blanks so that the end-anchored
    # patterns below behave the same for LF, CRLF and space-padded input.
    $line =~ s/\s+\z//;

    # -- INPUT -- #
    # An INPUT statement puts us in a receptive state; a ";" on a later
    # line takes us out of it again.
    if ($line =~ /INPUT/) {
        $input = 1;
        $out .= "infix ";
    }
    elsif ($input == 1 && $line =~ /;/) {
        $input = 0;
        # "____" is a placeholder for the data file name, left for the user
        # to fill in by hand.
        $out .= "using ____.dta, clear\n";
    }
    # "@start NAME [$]width.decimals" becomes "[str] NAME start-end".
    if ($input == 1
        && $line =~ /\s+\@(\d+)\s+([A-Z\d_]+)\s+([\$\s])(\d+).+$/)
    {
        my ($startcol, $varname, $type, $width) = ($1, $2, $3, $4);
        my $endcol = $startcol + $width - 1;
        # A "$" before the width marks a character variable in SAS.
        my $str = $type eq "\$" ? "str" : "";
        $out .= "$str $varname $startcol-$endcol ";
    }

    # -- LABELS -- #
    # A LABEL statement puts us in a receptive state until the next ";".
    if ($line =~ /LABEL/) {
        $label = 1;
    }
    elsif ($label == 1 && $line =~ /;/) {
        $label = 0;
    }
    # "NAME ='text'" becomes "label variable NAME 'text'".
    # NOTE(review): the SAS single-quoted text is passed through unchanged;
    # Stata's label syntax is documented with double quotes -- confirm the
    # generated do-file loads as expected.
    if ($label == 1 && $line =~ /([A-Z\d_]+)\s*=('.+')$/) {
        $out .= "label variable $1 $2\n";
    }

    # -- VALUE LABELS -- #
    # "VALUE [$]NAME" opens a label definition; the ";" that ends it also
    # ends the "label define" output line.
    if ($line =~ /VALUE (\$?)([A-Za-z\d_]+)$/) {
        $value = 1;
        $out .= "label define $2 ";
    }
    elsif ($value == 1 && $line =~ /;/) {
        $value = 0;
        $out .= "\n";
    }
    # "N = 'text'" (N an optionally quoted, optionally negative integer)
    # becomes "N 'text'". Anchored at the start of the line so that ranges
    # such as "1 - 5 = '...'" are not mistaken for the single value -5.
    if ($value == 1 && $line =~ /^\s*'?(-?\d+)'?\s*=\s*('.+')$/) {
        $out .= "$1 $2 ";
    }

    # -- APPLY VALUE LABELS TO VARIABLES -- #
    # A FORMAT statement puts us in a receptive state until the next ";".
    if ($line =~ /FORMAT/) {
        $format = 1;
    }
    elsif ($format == 1 && $line =~ /;/) {
        $format = 0;
    }
    # "VARNAME [$]FMTNAME." becomes "label values VARNAME FMTNAME".
    if ($format == 1
        && $line =~ /\s+([A-Za-z\d_]+) (\$?)([A-Za-z\d_]+)\.$/)
    {
        $out .= "label values $1 $3\n";
    }

    return $out;
}

print "* Convert basic SAS file to read in fixed format into Stata equivalent\n";

# Loop through each line of the SAS program supplied on STDIN.
while (my $line = <STDIN>) {
    print convert_line($line);
}
--Ari
Usage:
./SAS_to_Stata.pl < infile.sas > outfile.do
Code:
#!/usr/bin/perl
# Convert the INPUT, LABEL, VALUE and FORMAT statements of a SAS program that
# reads a fixed-width text file into the equivalent Stata do-file commands.
# The SAS program is read from STDIN; the do-file is written to STDOUT.
use strict;
use warnings;

# Parser state: each flag is 1 while we are inside the corresponding
# (possibly multi-line) SAS statement -- the "receptive state" -- and 0
# otherwise.
my $input  = 0;
my $label  = 0;
my $value  = 0;
my $format = 0;

# Translate one line of the SAS program, updating the parser state above.
# Takes the raw line (line ending optional) and returns the Stata text to
# emit for that line, which may be the empty string.
sub convert_line {
    my ($line) = @_;
    my $out = '';

    # Drop the line ending and any trailing blanks so that the end-anchored
    # patterns below behave the same for LF, CRLF and space-padded input.
    $line =~ s/\s+\z//;

    # -- INPUT -- #
    # An INPUT statement puts us in a receptive state; a ";" on a later
    # line takes us out of it again.
    if ($line =~ /INPUT/) {
        $input = 1;
        $out .= "infix ";
    }
    elsif ($input == 1 && $line =~ /;/) {
        $input = 0;
        # "____" is a placeholder for the data file name, left for the user
        # to fill in by hand.
        $out .= "using ____.dta, clear\n";
    }
    # "@start NAME [$]width.decimals" becomes "[str] NAME start-end".
    if ($input == 1
        && $line =~ /\s+\@(\d+)\s+([A-Z\d_]+)\s+([\$\s])(\d+).+$/)
    {
        my ($startcol, $varname, $type, $width) = ($1, $2, $3, $4);
        my $endcol = $startcol + $width - 1;
        # A "$" before the width marks a character variable in SAS.
        my $str = $type eq "\$" ? "str" : "";
        $out .= "$str $varname $startcol-$endcol ";
    }

    # -- LABELS -- #
    # A LABEL statement puts us in a receptive state until the next ";".
    if ($line =~ /LABEL/) {
        $label = 1;
    }
    elsif ($label == 1 && $line =~ /;/) {
        $label = 0;
    }
    # "NAME ='text'" becomes "label variable NAME 'text'".
    # NOTE(review): the SAS single-quoted text is passed through unchanged;
    # Stata's label syntax is documented with double quotes -- confirm the
    # generated do-file loads as expected.
    if ($label == 1 && $line =~ /([A-Z\d_]+)\s*=('.+')$/) {
        $out .= "label variable $1 $2\n";
    }

    # -- VALUE LABELS -- #
    # "VALUE [$]NAME" opens a label definition; the ";" that ends it also
    # ends the "label define" output line.
    if ($line =~ /VALUE (\$?)([A-Za-z\d_]+)$/) {
        $value = 1;
        $out .= "label define $2 ";
    }
    elsif ($value == 1 && $line =~ /;/) {
        $value = 0;
        $out .= "\n";
    }
    # "N = 'text'" (N an optionally quoted, optionally negative integer)
    # becomes "N 'text'". Anchored at the start of the line so that ranges
    # such as "1 - 5 = '...'" are not mistaken for the single value -5.
    if ($value == 1 && $line =~ /^\s*'?(-?\d+)'?\s*=\s*('.+')$/) {
        $out .= "$1 $2 ";
    }

    # -- APPLY VALUE LABELS TO VARIABLES -- #
    # A FORMAT statement puts us in a receptive state until the next ";".
    if ($line =~ /FORMAT/) {
        $format = 1;
    }
    elsif ($format == 1 && $line =~ /;/) {
        $format = 0;
    }
    # "VARNAME [$]FMTNAME." becomes "label values VARNAME FMTNAME".
    if ($format == 1
        && $line =~ /\s+([A-Za-z\d_]+) (\$?)([A-Za-z\d_]+)\.$/)
    {
        $out .= "label values $1 $3\n";
    }

    return $out;
}

print "* Convert basic SAS file to read in fixed format into Stata equivalent\n";

# Loop through each line of the SAS program supplied on STDIN.
while (my $line = <STDIN>) {
    print convert_line($line);
}