R for Data Science之——Relational data
R for Data Science之——Relational data
很多时候我们处理的不是单一数据集而是互相关联的相关数据集,他们之间以主键外键等想连接,我们需要将他们整合成一个完整数据集进行分析:
library(tidyverse)
library(nycflights13)
airlines
#> # A tibble: 16 x 2
#> carrier name
#> <chr> <chr>
#> 1 9E Endeavor Air Inc.
#> 2 AA American Airlines Inc.
#> 3 AS Alaska Airlines Inc.
#> 4 B6 JetBlue Airways
#> 5 DL Delta Air Lines Inc.
#> 6 EV ExpressJet Airlines Inc.
#> # ... with 10 more rows
airports
#> # A tibble: 1,458 x 8
#> faa name lat lon alt tz dst tzone
#> <chr> <chr> <dbl> <dbl> <int> <dbl> <chr> <chr>
#> 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New…
#> 2 06A Moton Field Municipal … 32.5 -85.7 264 -6 A America/Chi…
#> 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chi…
#> 4 06N Randall Airport 41.4 -74.4 523 -5 A America/New…
#> 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New…
#> 6 0A9 Elizabethton Municipal… 36.4 -82.2 1593 -5 A America/New…
#> # ... with 1,452 more rows
planes
#> # A tibble: 3,322 x 9
#> tailnum year type manufacturer model engines seats speed engine
#> <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
#> 1 N10156 2004 Fixed win… EMBRAER EMB-1… 2 55 NA Turbo…
#> 2 N102UW 1998 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
#> 3 N103US 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
#> 4 N104UW 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
#> 5 N10575 2002 Fixed win… EMBRAER EMB-1… 2 55 NA Turbo…
#> 6 N105UW 1999 Fixed win… AIRBUS INDUS… A320-… 2 182 NA Turbo…
#> # ... with 3,316 more rows
weather
#> # A tibble: 26,115 x 15
#> origin year month day hour temp dewp humid wind_dir wind_speed
#> <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
#> 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
#> 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
#> 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
#> 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
#> 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
#> # ... with 2.611e+04 more rows, and 5 more variables: wind_gust <dbl>,
#> # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
以上数据集之间的关系为:
这其中planestailnum则为一个外键,对于主键来说,其在数据集中都是单一的:
planes %>%
count(tailnum) %>%
filter(n > 1)
#> # A tibble: 0 x 2
#> # ... with 2 variables: tailnum <chr>, n <int>
weather %>%
count(year, month, day, hour, origin) %>%
filter(n > 1)
#> # A tibble: 3 x 6
#> year month day hour origin n
#> <dbl> <dbl> <int> <int> <chr> <int>
#> 1 2013 11 3 1 EWR 2
#> 2 2013 11 3 1 JFK 2
#> 3 2013 11 3 1 LGA 2
但是flights数据集的主键是什么呢:
flights %>%
count(year, month, day, flight) %>%
filter(n > 1)
#> # A tibble: 29,768 x 5
#> year month day flight n
#> <int> <int> <int> <int> <int>
#> 1 2013 1 1 1 2
#> 2 2013 1 1 3 2
#> 3 2013 1 1 4 2
#> 4 2013 1 1 11 3
#> 5 2013 1 1 15 2
#> 6 2013 1 1 21 2
#> # ... with 2.976e+04 more rows
flights %>%
count(year, month, day, tailnum) %>%
filter(n > 1)
#> # A tibble: 64,928 x 5
#> year month day tailnum n
#> <int> <int> <int> <chr> <int>
#> 1 2013 1 1 N0EGMQ 2
#> 2 2013 1 1 N11189 2
#> 3 2013 1 1 N11536 2
#> 4 2013 1 1 N11544 3
#> 5 2013 1 1 N11551 2
#> 6 2013 1 1 N12540 2
#> # ... with 6.492e+04 more rows
可见这都不是主键,在这种情况下我们通常用mutate()和row_number()结合生成新的一列作为主键,这被称为surrogate key 代理主键。
Mutating joins
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
#> # A tibble: 336,776 x 8
#> year month day hour origin dest tailnum carrier
#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
#> 1 2013 1 1 5 EWR IAH N14228 UA
#> 2 2013 1 1 5 LGA IAH N24211 UA
#> 3 2013 1 1 5 JFK MIA N619AA AA
#> 4 2013 1 1 5 JFK BQN N804JB B6
#> 5 2013 1 1 6 LGA ATL N668DN DL
#> 6 2013 1 1 5 EWR ORD N39463 UA
#> # ... with 3.368e+05 more rows
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = "carrier")
#> # A tibble: 336,776 x 7
#> year month day hour tailnum carrier name
#> <int> <int> <int> <dbl> <chr> <chr> <chr>
#> 1 2013 1 1 5 N14228 UA United Air Lines Inc.
#> 2 2013 1 1 5 N24211 UA United Air Lines Inc.
#> 3 2013 1 1 5 N619AA AA American Airlines Inc.
#> 4 2013 1 1 5 N804JB B6 JetBlue Airways
#> 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
#> 6 2013 1 1 5 N39463 UA United Air Lines Inc.
#> # ... with 3.368e+05 more rows
flights2 %>%
select(-origin, -dest) %>%
mutate(name = airlines$name[match(carrier, airlines$carrier)])
#> # A tibble: 336,776 x 7
#> year month day hour tailnum carrier name
#> <int> <int> <int> <dbl> <chr> <chr> <chr>
#> 1 2013 1 1 5 N14228 UA United Air Lines Inc.
#> 2 2013 1 1 5 N24211 UA United Air Lines Inc.
#> 3 2013 1 1 5 N619AA AA American Airlines Inc.
#> 4 2013 1 1 5 N804JB B6 JetBlue Airways
#> 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
#> 6 2013 1 1 5 N39463 UA United Air Lines Inc.
#> # ... with 3.368e+05 more rows
Inner join
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
x %>%
inner_join(y, by = "key")
#> # A tibble: 2 x 3
#> key val_x val_y
#> <dbl> <chr> <chr>
#> 1 1 x1 y1
#> 2 2 x2 y2
left join
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
1, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2"
)
left_join(x, y, by = "key")
#> # A tibble: 4 x 3
#> key val_x val_y
#> <dbl> <chr> <chr>
#> 1 1 x1 y1
#> 2 2 x2 y2
#> 3 2 x3 y2
#> 4 1 x4 y1
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
2, "x3",
3, "x4"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
2, "y3",
3, "y4"
)
left_join(x, y, by = "key")
#> # A tibble: 6 x 3
#> key val_x val_y
#> <dbl> <chr> <chr>
#> 1 1 x1 y1
#> 2 2 x2 y2
#> 3 2 x2 y3
#> 4 2 x3 y2
#> 5 2 x3 y3
#> 6 3 x4 y4
如果省略by关键词,会根据所有共同的变量进行整合:
flights2 %>%
left_join(weather)
#> Joining, by = c("year", "month", "day", "hour", "origin")
#> # A tibble: 336,776 x 18
#> year month day hour origin dest tailnum carrier temp dewp humid
#> <dbl> <dbl> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 2013 1 1 5 EWR IAH N14228 UA 39.0 28.0 64.4
#> 2 2013 1 1 5 LGA IAH N24211 UA 39.9 25.0 54.8
#> 3 2013 1 1 5 JFK MIA N619AA AA 39.0 27.0 61.6
#> 4 2013 1 1 5 JFK BQN N804JB B6 39.0 27.0 61.6
#> 5 2013 1 1 6 LGA ATL N668DN DL 39.9 25.0 54.8
#> 6 2013 1 1 5 EWR ORD N39463 UA 39.0 28.0 64.4
#> # ... with 3.368e+05 more rows, and 7 more variables: wind_dir <dbl>,
#> # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
#> # visib <dbl>, time_hour <dttm>
flights2 %>%
left_join(planes, by = "tailnum")
#> # A tibble: 336,776 x 16
#> year.x month day hour origin dest tailnum carrier year.y type
#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr>
#> 1 2013 1 1 5 EWR IAH N14228 UA 1999 Fixe…
#> 2 2013 1 1 5 LGA IAH N24211 UA 1998 Fixe…
#> 3 2013 1 1 5 JFK MIA N619AA AA 1990 Fixe…
#> 4 2013 1 1 5 JFK BQN N804JB B6 2012 Fixe…
#> 5 2013 1 1 6 LGA ATL N668DN DL 1991 Fixe…
#> 6 2013 1 1 5 EWR ORD N39463 UA 2012 Fixe…
#> # ... with 3.368e+05 more rows, and 6 more variables: manufacturer <chr>,
#> # model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>
flights2 %>%
left_join(airports, c("dest" = "faa"))
#> # A tibble: 336,776 x 15
#> year month day hour origin dest tailnum carrier name lat lon
#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 2013 1 1 5 EWR IAH N14228 UA Geor… 30.0 -95.3
#> 2 2013 1 1 5 LGA IAH N24211 UA Geor… 30.0 -95.3
#> 3 2013 1 1 5 JFK MIA N619AA AA Miam… 25.8 -80.3
#> 4 2013 1 1 5 JFK BQN N804JB B6 <NA> NA NA
#> 5 2013 1 1 6 LGA ATL N668DN DL Hart… 33.6 -84.4
#> 6 2013 1 1 5 EWR ORD N39463 UA Chic… 42.0 -87.9
#> # ... with 3.368e+05 more rows, and 4 more variables: alt <int>, tz <dbl>,
#> # dst <chr>, tzone <chr>
flights2 %>%
left_join(airports, c("origin" = "faa"))
#> # A tibble: 336,776 x 15
#> year month day hour origin dest tailnum carrier name lat lon
#> <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
#> 1 2013 1 1 5 EWR IAH N14228 UA Newa… 40.7 -74.2
#> 2 2013 1 1 5 LGA IAH N24211 UA La G… 40.8 -73.9
#> 3 2013 1 1 5 JFK MIA N619AA AA John… 40.6 -73.8
#> 4 2013 1 1 5 JFK BQN N804JB B6 John… 40.6 -73.8
#> 5 2013 1 1 6 LGA ATL N668DN DL La G… 40.8 -73.9
#> 6 2013 1 1 5 EWR ORD N39463 UA Newa… 40.7 -74.2
#> # ... with 3.368e+05 more rows, and 4 more variables: alt <int>, tz <dbl>,
#> # dst <chr>, tzone <chr>
Filtering joins
- semi_join(x, y)保存所有x中与y匹配的观测值
- anti_join(x, y)抛弃所有x中与y匹配的观测值
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
#> # A tibble: 10 x 2
#> dest n
#> <chr> <int>
#> 1 ORD 17283
#> 2 ATL 17215
#> 3 LAX 16174
#> 4 BOS 15508
#> 5 MCO 14082
#> 6 CLT 14064
#> # ... with 4 more rows
flights %>%
filter(dest %in% top_dest$dest)
#> # A tibble: 141,145 x 19
#> year month day dep_time sched_dep_time dep_delay arr_time
#> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 2013 1 1 542 540 2 923
#> 2 2013 1 1 554 600 -6 812
#> 3 2013 1 1 554 558 -4 740
#> 4 2013 1 1 555 600 -5 913
#> 5 2013 1 1 557 600 -3 838
#> 6 2013 1 1 558 600 -2 753
#> # ... with 1.411e+05 more rows, and 12 more variables:
#> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
#> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
semi_join(top_dest)
#> Joining, by = "dest"
#> # A tibble: 141,145 x 19
#> year month day dep_time sched_dep_time dep_delay arr_time
#> <int> <int> <int> <int> <int> <dbl> <int>
#> 1 2013 1 1 542 540 2 923
#> 2 2013 1 1 554 600 -6 812
#> 3 2013 1 1 554 558 -4 740
#> 4 2013 1 1 555 600 -5 913
#> 5 2013 1 1 557 600 -3 838
#> 6 2013 1 1 558 600 -2 753
#> # ... with 1.411e+05 more rows, and 12 more variables:
#> # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
#> # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#> # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
#> # A tibble: 722 x 2
#> tailnum n
#> <chr> <int>
#> 1 <NA> 2512
#> 2 N725MQ 575
#> 3 N722MQ 513
#> 4 N723MQ 507
#> 5 N713MQ 483
#> 6 N735MQ 396
#> # ... with 716 more rows
Set operations
- intersect(x, y):返回x和y中共有的值
- union(x, y):返回x和y所有的值
- setdiff(x, y):返回x有但y没有的值
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)
intersect(df1, df2)
#> # A tibble: 1 x 2
#> x y
#> <dbl> <dbl>
#> 1 1 1
# Note that we get 3 rows, not 4
union(df1, df2)
#> # A tibble: 3 x 2
#> x y
#> <dbl> <dbl>
#> 1 1 2
#> 2 2 1
#> 3 1 1
setdiff(df1, df2)
#> # A tibble: 1 x 2
#> x y
#> <dbl> <dbl>
#> 1 2 1
setdiff(df2, df1)
#> # A tibble: 1 x 2
#> x y
#> <dbl> <dbl>
#> 1 1 2
所有代码已上传GITHUB点此进入