描述性分析

这里我们主要学习三个过程的使用方法:MEANS、UNIVARIATE 和 TABULATE

MEANS 过程

  • 先查一下sas 的帮助,可以看到如下对 MEANS 的描述:

使用 MEANS 过程计算特定的统计量

例1

  • 试一下下面的程序,看输出了哪些统计量
  • 例:
* Cake contest entries, read with column input (every field occupies fixed columns);
DATA cake;
   INPUT LastName     $  1-12
         Age            13-14
         PresentScore   16-17
         TasteScore     19-20
         Flavor       $ 23-32
         Layers         34;
   DATALINES;
Orlando     27 93 80  Vanilla    1
Ramey       32 84 72  Rum        2
Goldston    46 68 75  Vanilla    1
Roe         38 79 73  Vanilla    2
Larsen      23 77 84  Chocolate  .
Davis       51 86 91  Spice      3
Strickland  19 82 79  Chocolate  1
Nguyen      57 77 84  Vanilla    .
Hildenbrand 33 81 83  Chocolate  1
Byron       62 72 87  Vanilla    2
Sanders     26 56 79  Chocolate  1
Jaeger      43 66 74             1
Davis       28 69 75  Chocolate  2
Conrad      69 85 94  Vanilla    1
Walters     55 67 72  Chocolate  2
Rossburger  28 78 81  Spice      2
Matthew     42 81 92  Chocolate  2
Becker      36 62 83  Spice      2
Anderson    27 87 85  Chocolate  1
Merritt     62 73 84  Chocolate  1
;
RUN;

* N, mean, maximum, minimum, range and standard deviation of both scores, field width 8;
TITLE 'Summary of Presentation and Taste Scores';
PROC MEANS DATA=cake N MEAN MAX MIN RANGE STD FW=8;
   VAR PresentScore TasteScore;
RUN;
  • 尝试修改一下程序调用下面的统计量,看看输出了什么?

例2

  • 使用 MEANS 过程输出指定变量的统计量。

* Student records read with column input. Status and Year are kept as character values;
DATA grade;
   INPUT Name       $  1-8
         Gender     $ 11
         Status     $ 13
         Year       $ 15-16
         Section    $ 18
         Score        20-21
         FinalGrade   23-24;
   DATALINES;
Abbott    F 2 97 A 90 87
Branford  M 1 98 A 92 97
Crandell  M 2 98 B 81 71
Dennison  M 1 97 A 85 72
Edgar     F 1 98 B 89 80
Faust     M 1 97 B 78 73
Greeley   F 2 97 A 82 91
Hart      F 1 98 B 84 80
Isley     M 2 97 A 88 86
Jasper    M 1 97 B 91 93
;
RUN;

* Summarize Score overall, written as (), and for each Status by Year combination;
TITLE 'Final Exam Grades for Student Status and Year of Graduation';
PROC MEANS DATA=grade MAXDEC=3;
   CLASS Status Year;
   VAR Score;
   TYPES () Status*Year;
RUN;

将描述性统计写入SAS数据集中

* Read the flower sales file. SaleDate starts in column 9 and is read with MMDDYY10.;
data sales;
   infile 'c:\MyRawData\Flowers.dat';
   input CustomerID $
         @9 SaleDate MMDDYY10.
         Petunia SnapDragon Marigold;
run;

* The BY statement used below needs the rows ordered by CustomerID;
proc sort data=sales;
   by CustomerID;
run;

* Per customer, store the mean and the sum of each flower count in TOTALS without printing;
proc means data=sales noprint;
   by CustomerID;
   var Petunia SnapDragon Marigold;
   output out=totals
      mean(Petunia SnapDragon Marigold) = MeanPetunia MeanSnapDragon MeanMarigold
      sum(Petunia SnapDragon Marigold)  = Petunia SnapDragon Marigold;
run;

* List the summary data set, showing the means with width 3 and no decimals;
title 'Sum of Flower Data over Customer ID';
proc print data=totals;
   format MeanPetunia MeanSnapDragon MeanMarigold 3.;
run;
  • OUTPUT OUT=data-set output-statistic-list;
  • Data-set是要储存结果的数据集名,
  • output-statistic-list则界定需要保存哪些统计量和名称,
  • 可能的形式为:statistic(variable-list)=name-list
  • statistic可能是proc means语句中的任何一种统计量(sum,n,mean…),variable-list则界定VAR语句中哪些变量需要输出,name-list则定义统计量的新名字。

使用 freq 过程制作交叉表

* Read coffee orders, several observations per input line (trailing @@);
data orders;
   infile 'c:\MyRawData\Coffee.dat';
   input Coffee $ Window $ @@;
run;

* One-way frequency table for Window, then the Window by Coffee cross-tabulation;
proc freq data=orders;
   tables Window;
   tables Window*Coffee;
run;

tabulate 过程产生表格

* Read the boat file with column input, four character fields followed by a numeric price;
data boats;
   infile 'c:\MyRawData\Boats.dat';
   input Name       $  1-12
         Port       $ 14-20
         Locomotion $ 22-26
         Type       $ 28-30
         Price        32-36;
run;

* Three-dimensional table with Port as pages, Locomotion as rows and Type as columns;
title 'Number of Boats by Port, Locomotion, and Type';
proc tabulate data=boats;
   class Port Locomotion Type;
   table Port, Locomotion, Type;
run;

UNIVARIATE 过程

  • 先查一下 UNIVARIATE 的帮助:

  • 使用 UNIVARIATE 输出多个变量的统计量

例3

* Blood pressure readings, several patients per input line (trailing @@);
DATA BPressure;
   LENGTH PatientID $2;
   INPUT PatientID $ Systolic Diastolic @@;
   DATALINES;
CK 120 50  SS 96  60 FR 100 70
CP 120 75  BL 140 90 ES 120 70
CP 165 110 JI 110 40 MC 119 66
FC 125 76  RW 133 60 KD 108 54
DS 110 50  JW 130 80 BH 120 65
JW 134 80  SB 118 76 NS 122 78
GS 122 70  AB 122 78 EC 112 62
HH 122 82
;
RUN;

* Keep only the BasicMeasures and Quantiles tables from the procedure that follows;
ODS SELECT BasicMeasures Quantiles;
TITLE 'Systolic and Diastolic Blood Pressure';
PROC UNIVARIATE DATA=BPressure;
   VAR Systolic Diastolic;
RUN;

t检验

使用 TTEST 过程执行t检验

例4 (单样本t检验)检验BMI是否等于28.4:


* Weight in kg and height in cm per patient, with BMI computed as kg divided by metres squared;
data DIAB;
    input PATNO WT_KG HT_CM @@;
    BMI = WT_KG / (HT_CM / 100)**2;
    datalines;
 1 101.7 178     2  97.1 170
 3 114.2 191     4 101.9 179
 5  93.1 182     6 108.1 177
 7  85.0 184     8  89.1 182
 9  95.8 179    10  97.8 183
11  78.7   .    12  77.5 172
13 102.8 183    14  81.1 169
15 102.1 177    16 112.1 180
17  89.7 184
;
run;

* List the raw measurements next to the derived BMI, shown with one decimal;
title1 'One-Sample t-Test';
title2 'Body-Mass Index Data';
proc print data=DIAB;
    var PATNO HT_CM WT_KG BMI;
    format BMI 5.1;
run;

* One-sample t-test of the null hypothesis that mean BMI equals 28.4;
proc ttest data=DIAB h0=28.4;
    var BMI;
run;

例5 (配对样本t检验) 减肥药的功效

* Two weights per subject, WTPRE and WTPST, with WTLOSS as their difference;
data OBESE;
    input SUBJ WTPRE WTPST @@;
    WTLOSS = WTPRE - WTPST;
    datalines;
 1 165 160   2 202 200    3 256 259    4 155 156
 5 135 134   6 175 162    7 180 187    8 174 172
 9 136 138  10 168 162   11 207 197   12 155 155
13 220 205  14 163 153   15 159 150   16 253 255
17 138 128  18 287 280   19 177 171   20 181 170
21 148 154  22 167 170   23 190 180   24 165 154
25 155 150  26 153 145   27 205 206   28 186 184
29 178 166  30 129 132   31 125 127   32 165 169
33 156 158  34 170 161   35 145 152
;
run;

* List both weights together with the computed loss;
title1 'One-Sample t-Test';
title2 'Paired-Difference in Weight Loss';
proc print data=OBESE;
    var SUBJ WTPRE WTPST WTLOSS;
run;

* Paired t-test on the within-subject difference WTPRE minus WTPST;
proc ttest data=OBESE;
    paired WTPRE*WTPST;
run;

独立样本t检验

例6

* Read patient number, treatment group and two FEV values, several patients per line;
* CHG is FEV6 minus FEV0 and is missing whenever either value is missing;
DATA FEV;
    INPUT PATNO TRTGRP $ FEV0 FEV6 @@;
    CHG = FEV6 - FEV0;
    * Remove patients whose change cannot be computed;
    IF CHG = . THEN DELETE;
    DATALINES;
101 A 1.35  .     103 A 3.22 3.55   106 A 2.78 3.15
108 A 2.45 2.30   109 A 1.84 2.37   110 A 2.81 3.20
113 A 1.90 2.65   116 A 3.00 3.96   118 A 2.25 2.97
120 A 2.86 2.28   121 A 1.56 2.67   124 A 2.66 3.76
102 P 3.01 3.90   104 P 2.24 3.01   105 P 2.25 2.47
107 P 1.65 1.99   111 P 1.95  .     112 P 3.05 3.26
114 P 2.75 2.55   115 P 1.60 2.20   117 P 2.77 2.56
119 P 2.06 2.90   122 P 1.71  .     123 P 3.54 2.92
;
RUN;

* Character format that spells out the one-letter treatment codes;
PROC FORMAT;
    VALUE $TRT 'A' = 'ABC-123'
               'P' = 'PLACEBO';
RUN;

* List the retained patients with the treatment labels applied;
PROC PRINT DATA = FEV;
    VAR PATNO TRTGRP FEV0 FEV6 CHG;
    FORMAT TRTGRP $TRT.  FEV0 FEV6 CHG 5.2;
    TITLE1 'Two-Sample t-Test';
    TITLE2 'FEV1 Changes';
RUN;

* The BY statement below requires rows ordered by TRTGRP, so sort explicitly;
* instead of relying on the order in which the data lines were typed;
PROC SORT DATA = FEV;
    BY TRTGRP;
RUN;

* Mean, standard deviation, N, t statistic and its p-value within each treatment group;
PROC MEANS MEAN STD N T PRT DATA = FEV;
    BY TRTGRP;
    VAR FEV0 FEV6 CHG;
    FORMAT TRTGRP $TRT.;
RUN;

* Two-sample t-test comparing CHG between the two treatment groups;
PROC TTEST DATA = FEV;
    CLASS TRTGRP;
    VAR CHG;
    FORMAT TRTGRP $TRT.;
RUN;
  • 注意 format 过程 和 format 语句的使用
  • 注意数据步中 DELETE 语句的使用

作业 1

  • 使用下列语句生成数据集 read,以 count 作为频数变量,对 score 进行单样本t检验,检验 score 的均值是否等于33。(提示:自行查阅 FREQ 语句的使用方法)
* Each pair of values on a line is one observation holding a score and its count;
DATA read;
   INPUT score count @@;
   DATALINES;
40 2   47 2   52 2   26 1   19 2
25 2   35 4   39 1   26 1   48 1
14 2   22 1   42 1   34 2   33 2
18 1   15 1   29 1   41 2   44 1
51 1   43 1   27 2   46 2   28 1
49 1   31 1   28 1   54 1   45 1
;
RUN;

方差分析

单因素方差分析

* Each group of four values is one observation holding PainLevel, Codeine, Acupuncture and Relief;
data PainRelief;
   input PainLevel Codeine Acupuncture Relief @@;
   datalines;
1 1 1 0.0  1 2 1 0.5  1 1 2 0.6  1 2 2 1.2
2 1 1 0.3  2 2 1 0.6  2 1 2 0.7  2 2 2 1.3
3 1 1 0.4  3 2 1 0.8  3 1 2 0.8  3 2 2 1.6
4 1 1 0.4  4 2 1 0.7  4 1 2 0.9  4 2 2 1.5
5 1 1 0.6  5 2 1 1.0  5 1 2 1.5  5 2 2 1.9
6 1 1 0.9  6 2 1 1.4  6 1 2 1.6  6 2 2 2.3
7 1 1 1.0  7 2 1 1.8  7 1 2 1.7  7 2 2 2.1
8 1 1 1.2  8 2 1 1.7  8 1 2 1.6  8 2 2 2.4
;
run;

* Analysis of variance of Relief with three classification factors;
* The bar operator expands to Codeine, Acupuncture and their interaction;
* and PainLevel enters the model as a main effect only;
proc anova data=PainRelief;
   class PainLevel Codeine Acupuncture;
   model Relief = PainLevel Codeine|Acupuncture;
run;
* PROC ANOVA is interactive and stays active after RUN, so end it explicitly;
quit;
 

作业2

  • 利用下列语句生成数据集clover, 对不同的strain检验Nitrogen的均值是否一致。
* Each pair of values on a line is one observation holding a Strain label and a Nitrogen value;
DATA Clover;
   INPUT Strain $ Nitrogen @@;
   DATALINES;
3DOK1  19.4 3DOK1  32.6 3DOK1  27.0 3DOK1  32.1 3DOK1  33.0
3DOK5  17.7 3DOK5  24.8 3DOK5  27.9 3DOK5  25.2 3DOK5  24.3
3DOK4  17.0 3DOK4  19.4 3DOK4   9.1 3DOK4  11.9 3DOK4  15.8
3DOK7  20.7 3DOK7  21.0 3DOK7  20.5 3DOK7  18.8 3DOK7  18.6
3DOK13 14.3 3DOK13 14.4 3DOK13 11.8 3DOK13 11.6 3DOK13 14.2
COMPOS 17.3 COMPOS 19.4 COMPOS 19.1 COMPOS 16.9 COMPOS 20.8
;
RUN;