设为首页收藏本站

EPS数据狗论坛

 找回密码
 立即注册

QQ登录

只需一步,快速开始

查看: 1468|回复: 1

SAS 数据分析实例之数据描述、预处理和抽样

[复制链接]

22

主题

168

金钱

270

积分

入门用户

发表于 2019-9-19 14:08:18 | 显示全部楼层 |阅读模式

1 描述统计
1.1 描述表属性
  1. /* Report the attributes of SASHELP.CARS and print the captured metadata. */
  2. ods noproctitle;
  3. /* Restrict the output of the next procedure step to these ODS tables. */
  4. ods select attributes variables enginehost directory;

  5. proc datasets lib=SASHELP;
  6.         /* OUT= stores the variable-level details in WORK.TableAttributes. */
  7.         contents data=SASHELP.CARS order=collate
  8.                 out=WORK.TableAttributes (label="Contents Details for SASHELP.CARS");
  9. quit;

  10. /* Name the data set explicitly instead of relying on the implicit _LAST_ table. */
  11. proc print data=WORK.TableAttributes;
  12. run;
复制代码


1.2 描述数据特征
1.2.1 分析分类变量
  1. /* Frequency tables and frequency plots for the categorical variables of SASHELP.CARS. */
  2. title "分类变量的频数";

  3. proc freq data=SASHELP.CARS;
  4.         /* MISSING counts missing values as a level of their own. */
  5.         tables Make Model Type Origin DriveTrain / plots=(freqplot) missing;
  6. run;

  7. /* Clear the global title so it does not carry over into later output. */
  8. title;
复制代码


1.2.2 分析数值变量
  1. /* Summary statistics for the numeric variables of SASHELP.CARS, followed by their histograms. */
  2. title "数值变量的描述性统计量";

  3. proc means data=SASHELP.CARS n nmiss min mean median max std;
  4.         var MSRP Invoice
  5.             EngineSize Cylinders Horsepower
  6.             MPG_City MPG_Highway
  7.             Weight Wheelbase Length;
  8. run;

  9. /* Reset the title before the plots are drawn. */
  10. title;

  11. /* NOPRINT suppresses the tabular output; the HISTOGRAM statement still draws the plots. */
  12. proc univariate data=SASHELP.CARS noprint;
  13.         histogram MSRP Invoice
  14.                   EngineSize Cylinders Horsepower
  15.                   MPG_City MPG_Highway
  16.                   Weight Wheelbase Length;
  17. run;
复制代码


1.3 描述缺失数据
  1. ods noproctitle; /* Missing-data report for SASHELP.CARS: per-variable counts, then cross-variable patterns. */

  2. proc format;
  3.         value _nmissprint low-high="非缺失"; /* numeric: every value in low-high shares one label */
  4.         value $_cmissprint " "=" " other="非缺失"; /* character: blank stays blank, anything else shares one label */
  5. run;

  6. proc freq data=SASHELP.CARS; /* one-way tables; the formats collapse each variable into labelled vs. unlabelled values */
  7.         title3 "缺失数据频数";
  8.         title4 h=2 "图例: .、A、B,其他 = 缺失";
  9.         format MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway
  10.                 Weight Wheelbase Length _nmissprint.;
  11.         format Make Model Type Origin DriveTrain $_cmissprint.;
  12.         tables Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders
  13.                 Horsepower MPG_City MPG_Highway Weight Wheelbase Length / missing nocum;
  14. run;

  15. proc freq data=SASHELP.CARS noprint; /* cross all variables; NOPRINT with OUT= sends the result only to a data set */
  16.         table Make * Model * Type * Origin * DriveTrain * MSRP * Invoice * EngineSize
  17.                 * Cylinders * Horsepower * MPG_City * MPG_Highway * Weight * Wheelbase *
  18.                 Length / missing out=Work._MissingData_;
  19.         format MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway
  20.                 Weight Wheelbase Length _nmissprint.;
  21.         format Make Model Type Origin DriveTrain $_cmissprint.;
  22. run;

  23. proc print data=Work._MissingData_ noobs label; /* list the pattern table with the same formats applied */
  24.         title3 "跨变量的缺失数据模式";
  25.         title4 h=2 "图例: .、A、B,其他 = 缺失";
  26.         format MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway
  27.                 Weight Wheelbase Length _nmissprint.;
  28.         format Make Model Type Origin DriveTrain $_cmissprint.;
  29.         label count="频数" percent="百分比";
  30. run;

  31. title3; /* clears title3 and the higher-numbered title4 */

  32. proc delete data=Work._MissingData_; /* drop the temporary pattern table */
  33. run;
复制代码


2 数据预处理
2.1 列出数据
  1. /* List Position by Team from SASHELP.BASEBALL, with Salary totals. */
  2. title1 "列出数据 - SASHELP.BASEBALL";

  3. /* BY-group printing needs the rows ordered by Team; sort into a scratch table. */
  4. proc sort data=SASHELP.BASEBALL
  5.           out=WORK.SORTTEMP;
  6.         by Team;
  7. run;

  8. proc print data=WORK.SORTTEMP n label;
  9.         by Team;
  10.         var Position;
  11.         sum Salary;
  12. run;

  13. /* Remove the scratch table, then clear the title. */
  14. proc delete data=work.SORTTEMP;
  15. run;

  16. title1;
复制代码


2.2 过滤数据
  1. /* Keep rows with Position "CF" whose Salary is known and below 100. */
  2. proc sql noprint;
  3.         create table WORK.filter as
  4.                 select *
  5.                 from SASHELP.BASEBALL
  6.                 /* SAS orders missing numerics below every number, so a bare */
  7.                 /* "Salary LT 100" would also keep rows with no Salary value. */
  8.                 where Salary is not missing
  9.                         AND Salary LT 100
  10.                         AND Position EQ "CF";
  11. quit;
复制代码


2.3 排序数据
  1. /* Copy SASHELP.BASEBALL into WORK.sortDS ordered from highest to lowest Salary. */
  2. proc sort data=SASHELP.BASEBALL
  3.           out=WORK.sortDS
  4.           noequals;
  5.         by descending Salary;
  6. run;
复制代码


2.4 排名数据
  1. /* Rank rows by Salary in descending order; the ranks are stored in rank_Salary. */
  2. proc rank data=SASHELP.BASEBALL
  3.           out=WORK.Rank
  4.           descending;
  5.         var Salary;
  6.         ranks rank_Salary;
  7. run;
复制代码


2.5 转换数据
  1. /* Add log_Salary = log(Salary) and inv_CrHits = 1 / CrHits to a copy of SASHELP.BASEBALL. */
  2. data WORK.transform;
  3.         set SASHELP.BASEBALL;
  4.         /* LOG needs a positive argument; missing or non-positive Salary leaves log_Salary missing. */
  5.         if Salary > 0 then log_Salary=log(Salary);
  6.         /* Skip missing or zero CrHits so no division-by-zero is written to the log. */
  7.         if not missing(CrHits) and CrHits ne 0 then inv_CrHits=1 / CrHits;
  8. run;
复制代码


2.6 标准化数据
  1. /* Standardize Salary, nHits and nAtBat with METHOD=STD; the result is written to WORK.Stdize. */
  2. /* SPREFIX= sets the name prefix used for the standardized variables. */
  3. proc stdize data=SASHELP.BASEBALL
  4.             out=WORK.Stdize
  5.             method=std
  6.             nomiss
  7.             oprefix
  8.             sprefix=Standardized_;
  9.         var Salary nHits nAtBat;
  10. run;
复制代码


2.7 重新编码值
  1. /* Recode Salary into _recodeVar_: 100 becomes 99999, 200 becomes 88888, anything else is copied. */
  2. data WORK.recodedValues;
  3.         set SASHELP.BASEBALL;

  4.         if Salary = 100 then _recodeVar_=99999;
  5.         else if Salary = 200 then _recodeVar_=88888;
  6.         else _recodeVar_=Salary;
  7. run;
复制代码


2.8 重新编码范围
  1. /* Recode Salary into _recodeVar_: values from -1 through 100 become 100, the rest are copied. */
  2. data WORK.recodedRanges;
  3.         set SASHELP.BASEBALL;

  4.         if (-1 <= Salary) and (Salary <= 100) then _recodeVar_=100;
  5.         else _recodeVar_=Salary;
  6. run;
复制代码


3 随机抽样
3.1 简单随机抽样
3.1.1 无放回不重复抽样
  1. /* Simple random sample without replacement: 30% of SASHELP.BASEBALL, fixed seed. */
  2. /* No STRATA statement: simple random sampling draws from the table as a whole, */
  3. /* and STRATA expects the input to be sorted by the stratum variable. */
  4. proc surveyselect data=SASHELP.BASEBALL out=WORK.RandomSample
  5.                 method=srs samprate=0.3 seed=2019;
  6. run;
复制代码


3.1.2 有放回重复抽样
  1. /* Simple random sample with replacement: 30 draws from SASHELP.BASEBALL, fixed seed. */
  2. /* OUTHITS writes one output row per selection, so repeated picks appear as separate rows. */
  3. /* No STRATA statement: simple random sampling draws from the table as a whole, */
  4. /* and STRATA expects the input to be sorted by the stratum variable. */
  5. proc surveyselect data=SASHELP.BASEBALL out=WORK.RandomSample
  6.         outhits method=urs sampsize=30 seed=2019;
  7. run;
复制代码



3.2 分层随机抽样
3.2.1 无放回不重复抽样
  1. /* Stratified random sample without replacement, using Position as the stratum variable. */
  2. /* STRATA expects the input sorted by the stratum variable, so sort into a scratch table first. */
  3. proc sort data=SASHELP.BASEBALL out=WORK.SORTTempTableSorted;
  4.         by Position;
  5. run;

  6. /* NOTE(review): alloc=prop is combined with samprate= here; confirm against the */
  7. /* PROC SURVEYSELECT documentation whether ALLOC= expects a total SAMPSIZE= instead. */
  8. proc surveyselect data=WORK.SORTTempTableSorted out=WORK.RandomSample
  9.                 method=srs samprate=0.3 seed=2019;
  10.         strata Position / alloc=prop;
  11. run;

  12. /* Drop the scratch table once the sample has been drawn. */
  13. proc delete data=WORK.SORTTempTableSorted;
  14. run;
复制代码


3.2.2 有放回重复抽样
  1. /* Stratified random sample with replacement, using Position as the stratum variable. */
  2. /* STRATA expects the input sorted by the stratum variable, so sort into a scratch table first. */
  3. proc sort data=SASHELP.BASEBALL out=WORK.SORTTempTableSorted;
  4.         by Position;
  5. run;

  6. /* sampsize=30 is the total sample size; alloc=prop divides it among the strata */
  7. /* in proportion to their sizes. OUTHITS writes one output row per selection. */
  8. proc surveyselect data=WORK.SORTTempTableSorted out=WORK.RandomSample
  9.         outhits method=urs sampsize=30 seed=2019;
  10.         strata Position / alloc=prop;
  11. run;

  12. /* Drop the scratch table once the sample has been drawn. */
  13. proc delete data=WORK.SORTTempTableSorted;
  14. run;
复制代码

224

主题

2万

金钱

3万

积分

专家用户

发表于 2019-9-22 07:17:32 | 显示全部楼层
看一看,谢谢
大道无痕……
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

关闭

站长推荐上一条 /1 下一条

客服中心
关闭
在线时间:
周一~周五
8:30-17:30
QQ群:
653541906
联系电话:
010-85786021-8017
在线咨询
客服中心

意见反馈|网站地图|手机版|小黑屋|EPS数据狗论坛 ( 京ICP备09019565号-3 )   

Powered by BFIT! X3.4

© 2008-2028 BFIT Inc.

快速回复 返回顶部 返回列表