坚强别无选择 发表于 2019-9-19 14:08:18

SAS 数据分析实例之数据描述、预处理和抽样


1 描述统计
1.1 描述表属性
/* Describe the attributes of SASHELP.CARS and print the saved contents table. */
ods noproctitle;
/* Restrict ODS output to the listed output objects. */
ods select attributes variables enginehost directory;

proc datasets lib=SASHELP;
        /* Besides the report, write the contents details to WORK.TableAttributes. */
        contents data=SASHELP.CARS order=collate
                out=WORK.TableAttributes (label="Contents Details for SASHELP.CARS");
quit;

/* Name the table explicitly rather than relying on the most recently
   created data set, so inserting another step above cannot change what
   is printed. */
proc print data=WORK.TableAttributes;
run;

1.2 描述数据特征
1.2.1 分析分类变量
/* One-way frequency tables, each with a frequency plot, for the
   categorical variables of SASHELP.CARS. MISSING counts missing values
   as a level of their own. */
title "分类变量的频数";

proc freq data=SASHELP.CARS;
        tables Make Model Type Origin DriveTrain
                / missing plots=(freqplot);
run;

1.2.2 分析数值变量
/* Summary statistics for the numeric variables of SASHELP.CARS. */
title "数值变量的描述性统计量";

proc means data=SASHELP.CARS
                n nmiss min mean median max std;
        var MSRP Invoice EngineSize Cylinders Horsepower
                MPG_City MPG_Highway Weight Wheelbase Length;
run;

/* Clear the title before drawing the histograms. */
title;

/* NOPRINT suppresses the UNIVARIATE tables; only the histograms are requested. */
proc univariate data=SASHELP.CARS noprint;
        histogram MSRP Invoice EngineSize Cylinders Horsepower
                MPG_City MPG_Highway Weight Wheelbase Length;
run;

1.3 描述缺失数据
/* Describe missing data in SASHELP.CARS: per-variable missing counts and
   the missing-data pattern across variables. */
ods noproctitle;

/* Formats that collapse every non-missing value into a single label so
   that PROC FREQ only distinguishes missing from non-missing. */
proc format;
        value _nmissprint low-high="非缺失";
        value $_cmissprint " "=" " other="非缺失";
run;

/* Per-variable frequencies of missing versus non-missing values. */
proc freq data=SASHELP.CARS;
        title3 "缺失数据频数";
        /* Legend: the listed symbols ('.', 'A', 'B', and so on) denote missing
           values. The previous wording said "other = missing", which
           contradicted the formats above. */
        title4 h=2 "图例: .、A、B 等 = 缺失";
        format MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway
                Weight Wheelbase Length _nmissprint.;
        format Make Model Type Origin DriveTrain $_cmissprint.;
        tables Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders
                Horsepower MPG_City MPG_Highway Weight Wheelbase Length / missing nocum;
run;

/* Cross all variables (under the same formats) to obtain one row per
   distinct missing/non-missing pattern; the result goes to a work table. */
proc freq data=SASHELP.CARS noprint;
        table Make * Model * Type * Origin * DriveTrain * MSRP * Invoice * EngineSize
                * Cylinders * Horsepower * MPG_City * MPG_Highway * Weight * Wheelbase *
                Length / missing out=Work._MissingData_;
        format MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway
                Weight Wheelbase Length _nmissprint.;
        format Make Model Type Origin DriveTrain $_cmissprint.;
run;

/* Print the pattern table with localized column labels. */
proc print data=Work._MissingData_ noobs label;
        title3 "跨变量的缺失数据模式";
        title4 h=2 "图例: .、A、B 等 = 缺失";
        format MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway
                Weight Wheelbase Length _nmissprint.;
        format Make Model Type Origin DriveTrain $_cmissprint.;
        label count="频数" percent="百分比";
run;

/* Clear title3 and the higher-numbered title set above. */
title3;

/* Remove the temporary pattern table. */
proc delete data=Work._MissingData_;
run;

2 数据预处理
2.1 列出数据
/* List player positions grouped by team, with a Salary total per group. */
title1 "列出数据 - SASHELP.BASEBALL";

/* BY-group processing in PROC PRINT needs the rows ordered by Team,
   so sort into a temporary work table first. */
proc sort data=SASHELP.BASEBALL
                out=WORK.SORTTEMP;
        by Team;
run;

proc print data=WORK.SORTTEMP n label;
        by Team;
        var Position;
        sum Salary;
run;

/* Drop the temporary sorted copy. */
proc delete data=work.SORTTEMP;
run;

/* Reset the title. */
title1;

2.2 过滤数据
/* Copy the center fielders whose Salary is below 100 into WORK.filter.
   Note: SAS orders missing numeric values below every number, so rows
   with a missing Salary also satisfy "Salary LT 100". */
proc sql noprint;
        create table WORK.filter as
                select *
                from SASHELP.BASEBALL
                where (Salary LT 100 AND Position EQ "CF");
quit;

2.3 排序数据
/* Sort by Salary, highest first, into WORK.sortDS. NOEQUALS means the
   relative order of rows with equal Salary need not be preserved. */
proc sort data=SASHELP.BASEBALL
                out=WORK.sortDS
                noequals;
        by descending Salary;
run;

2.4 排名数据
/* Rank players by Salary from largest to smallest; the rank is stored in
   the new variable rank_Salary of WORK.Rank. */
proc rank data=SASHELP.BASEBALL
                out=WORK.Rank
                descending;
        var Salary;
        ranks rank_Salary;
run;

2.5 转换数据
/* Add a log-transformed Salary and the reciprocal of CrHits to a copy of
   SASHELP.BASEBALL. */
data WORK.transform;
        set SASHELP.BASEBALL;

        /* LOG is only defined for positive arguments. Guarding the call leaves
           log_Salary missing for missing or non-positive Salary without
           writing missing-value notes or invalid-argument errors to the log. */
        if Salary > 0 then log_Salary=log(Salary);

        /* Skip the division when CrHits is missing or zero; inv_CrHits then
           stays missing and no division-by-zero error is raised. */
        if CrHits then inv_CrHits=1 / CrHits;
run;

2.6 标准化数据
/* Standardize Salary, nHits and nAtBat with METHOD=STD. OPREFIX keeps the
   original variables in the output table, and the standardized versions
   are written with the Standardized_ name prefix. */
proc stdize data=SASHELP.BASEBALL
                out=WORK.Stdize
                method=std nomiss
                oprefix sprefix=Standardized_;
        var Salary nHits nAtBat;
run;

2.7 重新编码值
/* Recode individual Salary values into _recodeVar_:
   100 becomes 99999, 200 becomes 88888, anything else is copied unchanged. */
data WORK.recodedValues;
        set SASHELP.BASEBALL;

        if Salary = 100 then _recodeVar_=99999;
        else if Salary = 200 then _recodeVar_=88888;
        else _recodeVar_=Salary;
run;

2.8 重新编码范围
/* Recode a Salary range into _recodeVar_: values from -1 through 100
   (inclusive) become 100, anything else is copied unchanged. */
data WORK.recodedRanges;
        set SASHELP.BASEBALL;

        if -1 <= Salary <= 100 then _recodeVar_=100;
        else _recodeVar_=Salary;
run;

3 随机抽样
3.1 简单随机抽样
3.1.1 无放回不重复抽样
/* Simple random sample without replacement: draw 30% of the rows of
   SASHELP.BASEBALL into WORK.RandomSample, with a fixed seed so the draw
   is reproducible.
   No STRATA statement is used here: simple random sampling samples from
   the whole table, and the input is not sorted by Position in this step.
   Stratified sampling is a separate case that first sorts by the
   stratum variable. */
proc surveyselect data=SASHELP.BASEBALL out=WORK.RandomSample
                method=srs samprate=0.3 seed=2019;
run;

3.1.2 有放回重复抽样
/* Simple random sample with replacement: draw 30 rows from
   SASHELP.BASEBALL into WORK.RandomSample, with a fixed seed so the draw
   is reproducible. OUTHITS writes one output row per selection, so a row
   drawn more than once appears more than once.
   No STRATA statement is used here: simple random sampling samples from
   the whole table, and the input is not sorted by Position in this step. */
proc surveyselect data=SASHELP.BASEBALL out=WORK.RandomSample
        outhits method=urs sampsize=30 seed=2019;
run;


3.2 分层随机抽样
3.2.1 无放回不重复抽样
/* Stratified random sample without replacement, with Position as the
   stratum variable. */

/* The STRATA statement needs the input ordered by the stratum variable,
   so sort into a temporary work table first. */
proc sort data=SASHELP.BASEBALL out=WORK.SORTTempTableSorted;
        by Position;
run;

/* Sample at a 30% rate with a fixed seed; ALLOC=PROP requests
   proportional allocation across the Position strata. */
proc surveyselect data=WORK.SORTTempTableSorted out=WORK.RandomSample
                method=srs samprate=0.3 seed=2019;
        strata Position / alloc=prop;
run;

/* Remove the temporary sorted copy once the sample has been drawn. */
proc delete data=WORK.SORTTempTableSorted;
run;

3.2.2 有放回重复抽样
/* Stratified random sample with replacement, with Position as the
   stratum variable. */

/* The STRATA statement needs the input ordered by the stratum variable,
   so sort into a temporary work table first. */
proc sort data=SASHELP.BASEBALL out=WORK.SORTTempTableSorted;
        by Position;
run;

/* Draw a total of 30 selections with a fixed seed; ALLOC=PROP requests
   proportional allocation across the Position strata, and OUTHITS writes
   one output row per selection. */
proc surveyselect data=WORK.SORTTempTableSorted out=WORK.RandomSample
        outhits method=urs sampsize=30 seed=2019;
        strata Position / alloc=prop;
run;

/* Remove the temporary sorted copy once the sample has been drawn. */
proc delete data=WORK.SORTTempTableSorted;
run;

karstification 发表于 2019-9-22 07:17:32

看一看,谢谢
页: [1]
查看完整版本: SAS 数据分析实例之数据描述、预处理和抽样