需要安装的包:

install.packages("reshape2") 

建立数据集:

# Class Roster Dataset
Student <- c("John Davis","Angela Williams","Bullwinkle Moose",
             "David Jones","Janice Markhammer",
             "Cheryl Cushing","Reuven Ytzrhak",
             "Greg Knox","Joel England","Mary Rayburn")
math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
english <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)
roster <- data.frame(Student, math, science, english, stringsAsFactors=FALSE)

为了给所有学生确定一个单一的成绩衡量指标,需要将这些科目的成绩组合起来。另外,你还想将前20%的学生评定为A,接下来20%的学生评定为B,依次类推。最后,你希望按字母顺序对学生排序。

数值和字符处理函数

数学函数

计算某个数值向量的均值和标准差的两种方式:

# Calculating the mean and standard deviation
x <- c(1, 2, 3, 4, 5, 6, 7, 8)
mean(x)
## [1] 4.5
sd(x)
## [1] 2.449
n <- length(x)
meanx <- sum(x)/n
css <- sum((x - meanx)**2)            
sdx <- sqrt(css / (n-1))
meanx
## [1] 4.5
sdx
## [1] 2.449

统计函数:

概率函数

其中第一个字母表示其所指分布的某一方面:

  • d = 密度函数(density)
  • p = 分布函数(distribution function)
  • q = 分位数函数(quantile function)
  • r = 生成随机数(随机偏差)
#  Generating pseudo-random numbers from 
# a uniform distribution
runif(5)
## [1] 0.3363 0.3381 0.8801 0.7188 0.8684
runif(5)
## [1] 0.1445 0.8612 0.5172 0.9467 0.8904
set.seed(1234)                                                     
runif(5)
## [1] 0.1137 0.6223 0.6093 0.6234 0.8609
set.seed(1234)                                                      
runif(5)
## [1] 0.1137 0.6223 0.6093 0.6234 0.8609

多元正态分布

#  Generating data from a multivariate
# normal distribution
library(MASS)
mean <- c(230.7, 146.7, 3.6)                                           
sigma <- matrix( c(15360.8, 6721.2, -47.1,    6721.2, 4700.9, -16.5,-47.1,  -16.5,   0.3), nrow=3, ncol=3)

set.seed(1234)
mydata <- mvrnorm(500, mean, sigma)                                     
mydata <- as.data.frame(mydata)                                         
names(mydata) <- c("y", "x1", "x2")                                       
dim(mydata)                                                             
## [1] 500   3
head(mydata, n=10)   
##         y     x1    x2
## 1   98.77  41.26 3.433
## 2  244.46 205.20 3.796
## 3  375.66 186.71 2.513
## 4  -59.22  11.22 4.712
## 5  312.97 110.99 3.449
## 6  288.82 185.09 2.724
## 7  134.78 164.99 4.394
## 8  171.72  97.38 3.640
## 9  167.25 101.03 3.495
## 10 121.09  94.48 4.096

字符处理函数

其他实用函数

函数应用于矩阵和数据框

#  - Applying functions to data objects
a <- 5
sqrt(a)
## [1] 2.236
b <- c(1.243, 5.654, 2.99)
round(b)
## [1] 1 6 3
c <- matrix(runif(12), nrow=3)
c
##        [,1]   [,2]   [,3]   [,4]
## [1,] 0.9636 0.2160 0.2890 0.9128
## [2,] 0.2068 0.2396 0.8041 0.3534
## [3,] 0.0862 0.1972 0.3782 0.9315
log(c)
##          [,1]   [,2]    [,3]     [,4]
## [1,] -0.03708 -1.532 -1.2413 -0.09122
## [2,] -1.57619 -1.429 -0.2180 -1.04018
## [3,] -2.45111 -1.624 -0.9722 -0.07097
mean(c)
## [1] 0.4649
#  Listing 5.5 - Applying a function to the rows (columns) of a matrix
mydata <- matrix(rnorm(30), nrow=6)
mydata
##         [,1]    [,2]    [,3]    [,4]    [,5]
## [1,]  0.4585  1.2031  1.2339  0.5905 -0.2806
## [2,] -1.2611  0.7689 -1.8914 -0.4351  0.8121
## [3,] -0.5275  0.2384 -0.2227 -0.2508 -0.2077
## [4,] -0.5568 -1.4150  0.7681 -0.9263  1.4508
## [5,] -0.3744  2.9338  0.3880  1.0874  0.8415
## [6,] -0.6044  0.9350  0.6091 -1.9440 -0.8657
apply(mydata, 1, mean)     
## [1]  0.6411 -0.4013 -0.1940 -0.1358  0.9752 -0.3740
apply(mydata, 2, mean) 
## [1] -0.4776  0.7774  0.1475 -0.3130  0.2917
apply(mydata, 2, mean, trim=.4)   
## [1] -0.5421  0.8519  0.4985 -0.3430  0.3022

处理数据

解决开始的问题:

# A solution to the learning example
options(digits=2)
Student <- c("John Davis", "Angela Williams", "Bullwinkle Moose","David Jones", "Janice Markhammer", "Cheryl Cushing","Reuven Ytzrhak", "Greg Knox", "Joel England","Mary Rayburn")
Math <- c(502, 600, 412, 358, 495, 512, 410, 625, 573, 522)
Science <- c(95, 99, 80, 82, 75, 85, 80, 95, 89, 86)
English <- c(25, 22, 18, 15, 20, 28, 15, 30, 27, 18)

roster <- data.frame(Student, Math, Science, English,stringsAsFactors=FALSE)

z <- scale(roster[,2:4])
score <- apply(z, 1, mean)
roster <- cbind(roster, score)

y <- quantile(score, c(.8,.6,.4,.2))
roster$grade[score >= y[1]] <- "A"
roster$grade[score < y[1] & score >= y[2]] <- "B"
roster$grade[score < y[2] & score >= y[3]] <- "C"
roster$grade[score < y[3] & score >= y[4]] <- "D"
roster$grade[score < y[4]] <- "F"

name <- strsplit((roster$Student), " ")
Lastname <- sapply(name, "[", 2)
Firstname <- sapply(name, "[", 1)
roster <- cbind(Firstname,Lastname, roster[,-1])
roster <- roster[order(Lastname,Firstname),]

roster
##     Firstname   Lastname Math Science English score grade
## 6      Cheryl    Cushing  512      85      28  0.35     C
## 1        John      Davis  502      95      25  0.56     B
## 9        Joel    England  573      89      27  0.70     B
## 4       David      Jones  358      82      15 -1.16     F
## 8        Greg       Knox  625      95      30  1.34     A
## 5      Janice Markhammer  495      75      20 -0.63     D
## 3  Bullwinkle      Moose  412      80      18 -0.86     D
## 10       Mary    Rayburn  522      86      18 -0.18     C
## 2      Angela   Williams  600      99      22  0.92     A
## 7      Reuven    Ytzrhak  410      80      15 -1.05     F

语句控制

重复和循环

for结构

for (var in seq) statement

while结构

while (cond) statement

条件执行

if-else结构

if (cond) statement
if (cond) statement1 else statement2

ifelse结构

ifelse(cond, statement1, statement2)

若cond为TRUE,则执行第一个语句;若cond为FALSE,则执行第二个语句.

switch结构

switch(expr, ...)
#  A switch example
feelings <- c("sad", "afraid")
for (i in feelings)
  print(
    switch(i,
           happy  = "I am glad you are happy",
           afraid = "There is nothing to fear",
           sad    = "Cheer up",
           angry  = "Calm down now"
    )
  )
## [1] "Cheer up"
## [1] "There is nothing to fear"

自编函数

编写一个函数,用来计算数据对象的集中趋势和散布情况。此函数应当可以选择性地给出参数统计量(均值和标准差)和非参数统计量(中位数和绝对中位差)。结果应当以一个含名称列表的形式给出。另外,用户应当可以选择是否自动输出结果。除非另外指定,否则此函数的默认行为应当是计算参数统计量并且不输出结果。

#  - mystats(): a user-written function for 
# summary statistics
mystats <- function(x, parametric=TRUE, print=FALSE) {
  if (parametric) {
    center <- mean(x); spread <- sd(x)
  } else {
    center <- median(x); spread <- mad(x)
  }
  if (print & parametric) {
    cat("Mean=", center, "\n", "SD=", spread, "\n")
  } else if (print & !parametric) {
    cat("Median=", center, "\n", "MAD=", spread, "\n")
  }
  result <- list(center=center, spread=spread)
  return(result)
}
# trying it out
set.seed(1234)
x <- rnorm(500) 
y <- mystats(x)
y <- mystats(x, parametric=FALSE, print=TRUE)
## Median= -0.021 
##  MAD= 1
# mydate: a user-written function using switch
mydate <- function(type="long") {
  switch(type,
         long =  format(Sys.time(), "%A %B %d %Y"), 
         short = format(Sys.time(), "%m-%d-%y"),
         cat(type, "is not a recognized type\n"))
}
mydate("long")
## [1] "星期四 九月 01 2016"
mydate("short")
## [1] "09-01-16"
mydate()
## [1] "星期四 九月 01 2016"
mydate("medium")
## medium is not a recognized type

整合与重构

数据集的转置

mtcars这个数据集是从Motor Trend杂志(1974)提取的,它描述了34种车型的设计和性能特点(汽缸数、排量、马力、每加仑汽油行驶的英里数,等等)

head(mtcars)
##                   mpg cyl disp  hp drat  wt qsec vs am gear carb
## Mazda RX4          21   6  160 110  3.9 2.6   16  0  1    4    4
## Mazda RX4 Wag      21   6  160 110  3.9 2.9   17  0  1    4    4
## Datsun 710         23   4  108  93  3.8 2.3   19  1  1    4    1
## Hornet 4 Drive     21   6  258 110  3.1 3.2   19  1  0    3    1
## Hornet Sportabout  19   8  360 175  3.1 3.4   17  0  0    3    2
## Valiant            18   6  225 105  2.8 3.5   20  1  0    3    1
#  Transposing a dataset
cars <- mtcars[1:5, 1:4]      
cars
##                   mpg cyl disp  hp
## Mazda RX4          21   6  160 110
## Mazda RX4 Wag      21   6  160 110
## Datsun 710         23   4  108  93
## Hornet 4 Drive     21   6  258 110
## Hornet Sportabout  19   8  360 175
t(cars)
##      Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
## mpg         21            21         23             21                19
## cyl          6             6          4              6                 8
## disp       160           160        108            258               360
## hp         110           110         93            110               175

整合数据

根据汽缸数(cyl)和挡位数(gear)整合mtcars数据,并返回各个数值型变量的均值

# - Aggregating data
options(digits=3)
attach(mtcars)
aggdata <-aggregate(mtcars, by=list(cyl,gear), 
                    FUN=mean, na.rm=TRUE)
aggdata
##   Group.1 Group.2  mpg cyl disp  hp drat   wt qsec  vs   am gear carb
## 1       4       3 21.5   4  120  97 3.70 2.46 20.0 1.0 0.00    3 1.00
## 2       6       3 19.8   6  242 108 2.92 3.34 19.8 1.0 0.00    3 1.00
## 3       8       3 15.1   8  358 194 3.12 4.10 17.1 0.0 0.00    3 3.08
## 4       4       4 26.9   4  103  76 4.11 2.38 19.6 1.0 0.75    4 1.50
## 5       6       4 19.8   6  164 116 3.91 3.09 17.7 0.5 0.50    4 4.00
## 6       4       5 28.2   4  108 102 4.10 1.83 16.8 0.5 1.00    5 2.00
## 7       6       5 19.7   6  145 175 3.62 2.77 15.5 0.0 1.00    5 6.00
## 8       8       5 15.4   8  326 300 3.88 3.37 14.6 0.0 1.00    5 6.00

reshape2

rm(list = ls(all = TRUE)) 
# Using the reshape2 package
library(reshape2)

# input data
mydata <- read.table(header=TRUE, sep=" ", text="
ID Time X1 X2
1 1 5 6
1 2 3 5
2 1 6 1
2 2 2 4
")

融合

# melt data
md <- melt(mydata, id=c("ID", "Time"))
head(mydata)
##   ID Time X1 X2
## 1  1    1  5  6
## 2  1    2  3  5
## 3  2    1  6  1
## 4  2    2  2  4
head(md)
##   ID Time variable value
## 1  1    1       X1     5
## 2  1    2       X1     3
## 3  2    1       X1     6
## 4  2    2       X1     2
## 5  1    1       X2     6
## 6  1    2       X2     5

重铸

# reshaping with aggregation
dcast(md, ID~variable, mean)
##   ID X1  X2
## 1  1  4 5.5
## 2  2  4 2.5
dcast(md, Time~variable, mean)
##   Time  X1  X2
## 1    1 5.5 3.5
## 2    2 2.5 4.5
dcast(md, ID~Time, mean)
##   ID   1 2
## 1  1 5.5 4
## 2  2 3.5 3
# reshaping without aggregation
dcast(md, ID+Time~variable)
##   ID Time X1 X2
## 1  1    1  5  6
## 2  1    2  3  5
## 3  2    1  6  1
## 4  2    2  2  4
dcast(md, ID+variable~Time)
##   ID variable 1 2
## 1  1       X1 5 3
## 2  1       X2 6 5
## 3  2       X1 6 2
## 4  2       X2 1 4
dcast(md, ID~variable+Time)
##   ID X1_1 X1_2 X2_1 X2_2
## 1  1    5    3    6    5
## 2  2    6    2    1    4
detach("package:reshape2", unload=TRUE)

返回课程主页