需要安装的包:

# Install ISwR only when it is not already available, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("ISwR", quietly = TRUE)) {
  install.packages("ISwR")
}

R语言初步

来一个R的程序吧:

plot(rnorm(500))

计算器

2 + 2
## [1] 4
exp(-2)
## [1] 0.1353
rnorm(15)
##  [1] -0.229650  0.401697 -1.370342 -0.004025 -0.118978 -0.696247 -0.726071
##  [8] -2.412498  1.631338  1.495504 -1.263071  1.404315 -0.955993 -1.178038
## [15]  1.021226

赋值语句 Assignments <-

x <- 2
x
## [1] 2
x + x
## [1] 4

向量

计算bmi:

# Six weights (presumably in kilograms -- confirm against the data source).
weight <- c(60, 72, 57, 90, 95, 72)
weight
## [1] 60 72 57 90 95 72
# Six heights, one per weight (presumably in metres -- confirm).
height <- c(1.75, 1.80, 1.65, 1.90, 1.74, 1.91)
# Arithmetic is element-wise: each weight is divided by the square of the
# height at the same position, giving one BMI value per position.
bmi <- weight/height^2
bmi
## [1] 19.59 22.22 20.94 24.93 31.38 19.74

计算体重的均值和标准差:

# Mean by hand: the total divided by the number of observations.
sum(weight)
## [1] 446
sum(weight)/length(weight)
## [1] 74.33
xbar <- sum(weight)/length(weight)
# Deviations from the mean; the single value xbar is recycled across weight.
weight - xbar
## [1] -14.333  -2.333 -17.333  15.667  20.667  -2.333
# Squared deviations, then their sum.
(weight - xbar)^2
## [1] 205.444   5.444 300.444 245.444 427.111   5.444
sum((weight - xbar)^2)
## [1] 1189
# Standard deviation by hand: divide by n - 1, then take the square root.
sqrt(sum((weight - xbar)^2)/(length(weight) - 1))
## [1] 15.42
# The built-in functions give the same values as the manual steps above.
mean(weight)
## [1] 74.33
sd(weight)
## [1] 15.42

t 检验

t.test(bmi, mu=22.5)
## 
##  One Sample t-test
## 
## data:  bmi
## t = 0.34, df = 5, p-value = 0.7
## alternative hypothesis: true mean is not equal to 22.5
## 95 percent confidence interval:
##  18.42 27.85
## sample estimates:
## mean of x 
##     23.13

画图

plot(height,weight)

plot(height, weight, pch=2)

基本的操作

显示和设定工作目录

# Print the current working directory.
getwd()
## [1] "E:/mywork/study_R/R_sim_2016/R_base_2"
# The paths below are specific to the author's machine; replace them with a
# directory that exists on your own system.
setwd("E:/mywork/study_R/R_sim_2016")  # set your working directory
setwd("E:/mywork/study_R/R_sim_2016/R_base_2")  # set your working directory
getwd()
## [1] "E:/mywork/study_R/R_sim_2016/R_base_2"

显示工作空间(workspace)中的对象

ls()
## [1] "bmi"    "height" "weight" "x"      "xbar"

删除工作空间特定对象

rm(height, weight)
ls()
## [1] "bmi"  "x"    "xbar"

存储工作空间

save.image()

attach 和 detach 命令

# attach() places the data frame on the search path so that its columns can
# be referenced by bare name (see the search() output below).
attach(ISwR::thuesen)## install the "ISwR" package beforehand
blood.glucose
##  [1] 15.3 10.8  8.1 19.5  7.2  5.3  9.3 11.1  7.5 12.2  6.7  5.2 19.0 15.1
## [15]  6.7  8.6  4.2 10.3 12.5 16.1 13.3  4.9  8.8  9.5
search()
##  [1] ".GlobalEnv"        "ISwR::thuesen"     "package:stats"    
##  [4] "package:graphics"  "package:grDevices" "package:utils"    
##  [7] "package:datasets"  "package:methods"   "Autoloads"        
## [10] "package:base"
# with() evaluates the expression using the data frame's columns directly;
# the data frame is passed explicitly, so this call does not rely on attach().
with(ISwR::thuesen, plot(blood.glucose, short.velocity))

# detach() without arguments removes the entry at position 2 of the search
# path, which here is the data frame attached above.
detach()
search()
## [1] ".GlobalEnv"        "package:stats"     "package:graphics" 
## [4] "package:grDevices" "package:utils"     "package:datasets" 
## [7] "package:methods"   "Autoloads"         "package:base"

subset、transform 和 within 的用法

library("ISwR")
thuesen
##    blood.glucose short.velocity
## 1           15.3           1.76
## 2           10.8           1.34
## 3            8.1           1.27
## 4           19.5           1.47
## 5            7.2           1.27
## 6            5.3           1.49
## 7            9.3           1.31
## 8           11.1           1.09
## 9            7.5           1.18
## 10          12.2           1.22
## 11           6.7           1.25
## 12           5.2           1.19
## 13          19.0           1.95
## 14          15.1           1.28
## 15           6.7           1.52
## 16           8.6             NA
## 17           4.2           1.12
## 18          10.3           1.37
## 19          12.5           1.19
## 20          16.1           1.05
## 21          13.3           1.32
## 22           4.9           1.03
## 23           8.8           1.12
## 24           9.5           1.70
#knitr::kable(thuesen)
thue2 <- subset(thuesen,blood.glucose<7)
thue2
##    blood.glucose short.velocity
## 6            5.3           1.49
## 11           6.7           1.25
## 12           5.2           1.19
## 15           6.7           1.52
## 17           4.2           1.12
## 22           4.9           1.03
thue3 <- transform(thuesen,log.gluc=log(blood.glucose))
thue3
##    blood.glucose short.velocity log.gluc
## 1           15.3           1.76    2.728
## 2           10.8           1.34    2.380
## 3            8.1           1.27    2.092
## 4           19.5           1.47    2.970
## 5            7.2           1.27    1.974
## 6            5.3           1.49    1.668
## 7            9.3           1.31    2.230
## 8           11.1           1.09    2.407
## 9            7.5           1.18    2.015
## 10          12.2           1.22    2.501
## 11           6.7           1.25    1.902
## 12           5.2           1.19    1.649
## 13          19.0           1.95    2.944
## 14          15.1           1.28    2.715
## 15           6.7           1.52    1.902
## 16           8.6             NA    2.152
## 17           4.2           1.12    1.435
## 18          10.3           1.37    2.332
## 19          12.5           1.19    2.526
## 20          16.1           1.05    2.779
## 21          13.3           1.32    2.588
## 22           4.9           1.03    1.589
## 23           8.8           1.12    2.175
## 24           9.5           1.70    2.251
# within() evaluates the braced block using the columns of thuesen and
# returns a modified copy; variables that still exist when the block ends
# are added as columns.
thue4 <- within(thuesen,{
   log.gluc <- log(blood.glucose)
   m <- mean(log.gluc)
   centered.log.gluc <- log.gluc - m
   rm(m)  # drop the temporary so it does not become a column of the result
})
thue4
##    blood.glucose short.velocity centered.log.gluc log.gluc
## 1           15.3           1.76          0.481880    2.728
## 2           10.8           1.34          0.133573    2.380
## 3            8.1           1.27         -0.154109    2.092
## 4           19.5           1.47          0.724441    2.970
## 5            7.2           1.27         -0.271892    1.974
## 6            5.3           1.49         -0.578266    1.668
## 7            9.3           1.31         -0.015959    2.230
## 8           11.1           1.09          0.160972    2.407
## 9            7.5           1.18         -0.231070    2.015
## 10          12.2           1.22          0.255463    2.501
## 11           6.7           1.25         -0.343865    1.902
## 12           5.2           1.19         -0.597314    1.649
## 13          19.0           1.95          0.698466    2.944
## 14          15.1           1.28          0.468722    2.715
## 15           6.7           1.52         -0.343865    1.902
## 16           8.6             NA         -0.094211    2.152
## 17           4.2           1.12         -0.810888    1.435
## 18          10.3           1.37          0.086171    2.332
## 19          12.5           1.19          0.279756    2.526
## 20          16.1           1.05          0.532846    2.779
## 21          13.3           1.32          0.341791    2.588
## 22           4.9           1.03         -0.656738    1.589
## 23           8.8           1.12         -0.071221    2.175
## 24           9.5           1.70          0.005319    2.251

获得帮助

#help(mean)
#?"mean"

包的加载

library(survival)
## 
## Attaching package: 'survival'
## The following object is masked from 'package:ISwR':
## 
##     lung
detach("package:survival")

绘图系统

# Fix the random seed so the sample (and therefore the plots) is reproducible.
set.seed(1234)
x <- rnorm(100)

# First attempt: density-scaled histogram with the standard normal density
# drawn on top. The y-axis is sized for the bars only, so the top of the
# curve may be cut off.
hist(x, freq = FALSE)
curve(dnorm(x), add = TRUE)

# Second attempt: compute the bins without plotting, then choose a y-range
# that covers both the tallest bar and the peak of the density, dnorm(0).
h <- hist(x, plot = FALSE)
ylim <- range(0, h$density, dnorm(0))
hist(x, freq = FALSE, ylim = ylim)
curve(dnorm(x), add = TRUE)

数据集

向量

# Three atomic vectors: numeric, character and logical.
a <- c(1, 2, 5, 3, 6, -2, 4)
b <- c("one", "two", "three")
# NOTE(review): this variable shares its name with the base function c().
# Later calls such as c(1, 2) still work because R skips non-function
# objects when it looks up a function call, but a different variable name
# would be less confusing.
c <- c(TRUE, TRUE, TRUE, FALSE, TRUE, FALSE)

使用向量的下标

a <- c(1, 2, 5, 3, 6, -2, 4)
a[3]
## [1] 5
a[c(1, 3, 5)]
## [1] 1 5 6
a[2:6] 
## [1]  2  5  3  6 -2

建立字符型的向量:

a <- c("k", "j", "h", "a", "c", "m")
a[3]
## [1] "h"
a[c(1, 3, 5)]
## [1] "k" "h" "c"
a[2:6]
## [1] "j" "h" "a" "c" "m"

建立矩阵

y <- matrix(1:20, nrow=5, ncol=4)
y
##      [,1] [,2] [,3] [,4]
## [1,]    1    6   11   16
## [2,]    2    7   12   17
## [3,]    3    8   13   18
## [4,]    4    9   14   19
## [5,]    5   10   15   20
# Fill a 2 x 2 matrix row by row and label both dimensions.
cells <- c(1, 26, 24, 68)
rnames <- c("R1", "R2")
cnames <- c("C1", "C2")
mymatrix <- matrix(
  data = cells,
  nrow = 2,
  ncol = 2,
  byrow = TRUE,
  dimnames = list(rnames, cnames)
)
mymatrix
##    C1 C2
## R1  1 26
## R2 24 68
mymatrix <- matrix(cells, nrow=2, ncol=2, byrow=FALSE,
                   dimnames=list(rnames, cnames))
mymatrix
##    C1 C2
## R1  1 24
## R2 26 68

使用矩阵的下标

x <- matrix(1:10, nrow=2)
x
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    3    5    7    9
## [2,]    2    4    6    8   10
x[2,]
## [1]  2  4  6  8 10
x[,2]
## [1] 3 4
x[1,4]
## [1] 7
x[1, c(4,5)]
## [1] 7 9

数组

# Build a 2 x 3 x 4 array from the integers 1 to 24 with labelled dimensions.
dim1 <- c("A1", "A2")
dim2 <- c("B1", "B2", "B3")
dim3 <- c("C1", "C2", "C3", "C4")
z <- array(
  data = 1:24,
  dim = c(2, 3, 4),
  dimnames = list(dim1, dim2, dim3)
)
z
## , , C1
## 
##    B1 B2 B3
## A1  1  3  5
## A2  2  4  6
## 
## , , C2
## 
##    B1 B2 B3
## A1  7  9 11
## A2  8 10 12
## 
## , , C3
## 
##    B1 B2 B3
## A1 13 15 17
## A2 14 16 18
## 
## , , C4
## 
##    B1 B2 B3
## A1 19 21 23
## A2 20 22 24

数据框

数据框可通过函数data.frame()创建:

病例数据

病例数据

# Combine four equal-length vectors into a data frame, one column per vector.
patientID <- c(1, 2, 3, 4)
age <- c(25, 34, 28, 52)
diabetes <- c("Type1", "Type2", "Type1", "Type1")
status <- c("Poor", "Improved", "Excellent", "Poor")
patientdata <- data.frame(
  patientID = patientID,
  age = age,
  diabetes = diabetes,
  status = status
)
patientdata
##   patientID age diabetes    status
## 1         1  25    Type1      Poor
## 2         2  34    Type2  Improved
## 3         3  28    Type1 Excellent
## 4         4  52    Type1      Poor
# - Specifying elements of a dataframe
patientdata[1:2]
##   patientID age
## 1         1  25
## 2         2  34
## 3         3  28
## 4         4  52
patientdata[c("diabetes","status")]
##   diabetes    status
## 1    Type1      Poor
## 2    Type2  Improved
## 3    Type1 Excellent
## 4    Type1      Poor
patientdata$age  
## [1] 25 34 28 52

因子

类别(名义型)变量和有序类别(有序型)变量在R中称为因子(factor)。

#  - Using factors
patientID <- c(1, 2, 3, 4)
age <- c(25, 34, 28, 52)
diabetes <- c("Type1", "Type2", "Type1", "Type1")
status <- c("Poor", "Improved", "Excellent", "Poor")
diabetes <- factor(diabetes)

要表示有序型变量,需要为函数factor()指定参数ordered=TRUE:

# Spell the argument out as `ordered`; the previous `order=` spelling only
# worked through partial argument matching. No `levels=` is supplied, so the
# levels are sorted alphabetically (Excellent < Improved < Poor); pass
# levels = c("Poor", "Improved", "Excellent") if a clinically meaningful
# ordering is wanted.
status <- factor(status, ordered = TRUE)
patientdata <- data.frame(patientID, age, diabetes, status)
# Compact overview of each column's type and first values.
str(patientdata)
## 'data.frame':    4 obs. of  4 variables:
##  $ patientID: num  1 2 3 4
##  $ age      : num  25 34 28 52
##  $ diabetes : Factor w/ 2 levels "Type1","Type2": 1 2 1 1
##  $ status   : Ord.factor w/ 3 levels "Excellent"<"Improved"<..: 3 2 1 3
summary(patientdata)
##    patientID         age        diabetes       status 
##  Min.   :1.00   Min.   :25.0   Type1:3   Excellent:1  
##  1st Qu.:1.75   1st Qu.:27.2   Type2:1   Improved :1  
##  Median :2.50   Median :31.0             Poor     :2  
##  Mean   :2.50   Mean   :34.8                          
##  3rd Qu.:3.25   3rd Qu.:38.5                          
##  Max.   :4.00   Max.   :52.0

列表

#  - Creating a list
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow=5)
k <- c("one", "two", "three")
mylist <- list(title=g, ages=h, j, k)
mylist
## $title
## [1] "My First List"
## 
## $ages
## [1] 25 26 18 39
## 
## [[3]]
##      [,1] [,2]
## [1,]    1    6
## [2,]    2    7
## [3,]    3    8
## [4,]    4    9
## [5,]    5   10
## 
## [[4]]
## [1] "one"   "two"   "three"

键盘编辑数据

# Entering data interactively from the keyboard
# Start from an empty data frame that fixes the column names and types.
mydata <- data.frame(age=numeric(0),
                     gender=character(0), weight=numeric(0))
# edit() opens the data editor on a copy of mydata; the edited copy is
# assigned back. This step needs an interactive session.
mydata <- edit(mydata)
mydata

直接嵌入数据集

# Embed a small whitespace-separated table directly in the script and parse
# it as if it were a file; the first non-blank line supplies the column names.
mydatatxt <- "
age gender weight
25 m 166
30 f 115
18 f 120
"
mydata <- read.table(text = mydatatxt, header = TRUE)

读入数据

读入文本文件

使用read.table()从带分隔符的文本文件中导入数据

病例数据

病例数据

先建立如下的文件studentgrades.csv

StudentID,First,Last,Math,Science,Social Studies
011,Bob,Smith,90,80,67
012,Jane,Weary,75,,80
010,Dan,"Thornton, III",65,75,70
040,Mary,"O'Leary",90,95,92

注意:建立这个文件时,最后一行数据的末尾需要按一次回车(即文件以换行符结束),然后再保存!

这个文件可以从这里下载:http://statstudy.github.io/data/studentgrades.csv

# Remote copy of the same file:
# grades <- read.table("http://statstudy.github.io/data/studentgrades.csv",
#                      header = TRUE, row.names = "StudentID", sep = ",")
# Read the local comma-separated file with default column types, using the
# StudentID column as row names.
grades <- read.table(
  file = "studentgrades.csv",
  sep = ",",
  header = TRUE,
  row.names = "StudentID"
)
grades # print data frame
##    First          Last Math Science Social.Studies
## 11   Bob         Smith   90      80             67
## 12  Jane         Weary   75      NA             80
## 10   Dan Thornton, III   65      75             70
## 40  Mary       O'Leary   90      95             92
str(grades) # view data frame structure
## 'data.frame':    4 obs. of  5 variables:
##  $ First         : Factor w/ 4 levels "Bob","Dan","Jane",..: 1 3 2 4
##  $ Last          : Factor w/ 4 levels "O'Leary","Smith",..: 2 4 3 1
##  $ Math          : int  90 75 65 90
##  $ Science       : int  80 NA 75 95
##  $ Social.Studies: int  67 80 70 92

仔细观察读入数据文件的问题!

下面来纠正:

# Re-import with an explicit class for every column of the file: the first
# three (StudentID, First, Last) as character, the three score columns as
# numeric. Reading StudentID as character keeps its leading zeros.
grades <- read.table(
  file = "studentgrades.csv",
  sep = ",",
  header = TRUE,
  row.names = "StudentID",
  colClasses = rep(c("character", "numeric"), each = 3)
)
grades # print data frame
##     First          Last Math Science Social.Studies
## 011   Bob         Smith   90      80             67
## 012  Jane         Weary   75      NA             80
## 010   Dan Thornton, III   65      75             70
## 040  Mary       O'Leary   90      95             92
str(grades) # view data frame structure
## 'data.frame':    4 obs. of  5 variables:
##  $ First         : chr  "Bob" "Jane" "Dan" "Mary"
##  $ Last          : chr  "Smith" "Weary" "Thornton, III" "O'Leary"
##  $ Math          : num  90 75 65 90
##  $ Science       : num  80 NA 75 95
##  $ Social.Studies: num  67 80 70 92

导入Excel 数据

library(xlsx)
# Location of the Excel workbook to import.
workbook <- "c:/myworkbook.xlsx"
# Read the first worksheet of the workbook into a data frame.
mydataframe <- read.xlsx(file = workbook, sheetIndex = 1)

导入XML 数据

参阅这里.

从网页抓取数据

使用RCurl包和XML包.

导入SPSS、SAS、Stata数据

参看foreign包和Hmisc包.

练习

# Install and load the swirl package.
install.packages("swirl")
library(swirl)
# Fetch the "R Programming" course, then start the interactive session.
install_from_swirl("R Programming")
swirl()

返回课程主页