R for Data Science之——Relational data

R for Data Science之——Relational data

很多时候我们处理的不是单一数据集而是互相关联的相关数据集,他们之间以主键外键等想连接,我们需要将他们整合成一个完整数据集进行分析:

library(tidyverse)
library(nycflights13)

airlines
#> # A tibble: 16 x 2
#>   carrier name                    
#>   <chr>   <chr>                   
#> 1 9E      Endeavor Air Inc.       
#> 2 AA      American Airlines Inc.  
#> 3 AS      Alaska Airlines Inc.    
#> 4 B6      JetBlue Airways         
#> 5 DL      Delta Air Lines Inc.    
#> 6 EV      ExpressJet Airlines Inc.
#> # ... with 10 more rows

airports
#> # A tibble: 1,458 x 8
#>   faa   name                      lat   lon   alt    tz dst   tzone       
#>   <chr> <chr>                   <dbl> <dbl> <int> <dbl> <chr> <chr>       
#> 1 04G   Lansdowne Airport        41.1 -80.6  1044    -5 A     America/New…
#> 2 06A   Moton Field Municipal …  32.5 -85.7   264    -6 A     America/Chi…
#> 3 06C   Schaumburg Regional      42.0 -88.1   801    -6 A     America/Chi…
#> 4 06N   Randall Airport          41.4 -74.4   523    -5 A     America/New…
#> 5 09J   Jekyll Island Airport    31.1 -81.4    11    -5 A     America/New…
#> 6 0A9   Elizabethton Municipal…  36.4 -82.2  1593    -5 A     America/New…
#> # ... with 1,452 more rows

planes
#> # A tibble: 3,322 x 9
#>   tailnum  year type       manufacturer  model  engines seats speed engine
#>   <chr>   <int> <chr>      <chr>         <chr>    <int> <int> <int> <chr> 
#> 1 N10156   2004 Fixed win… EMBRAER       EMB-1…       2    55    NA Turbo…
#> 2 N102UW   1998 Fixed win… AIRBUS INDUS… A320-…       2   182    NA Turbo…
#> 3 N103US   1999 Fixed win… AIRBUS INDUS… A320-…       2   182    NA Turbo…
#> 4 N104UW   1999 Fixed win… AIRBUS INDUS… A320-…       2   182    NA Turbo…
#> 5 N10575   2002 Fixed win… EMBRAER       EMB-1…       2    55    NA Turbo…
#> 6 N105UW   1999 Fixed win… AIRBUS INDUS… A320-…       2   182    NA Turbo…
#> # ... with 3,316 more rows

weather
#> # A tibble: 26,115 x 15
#>   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
#>   <chr>  <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
#> 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
#> 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
#> 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
#> 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
#> 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
#> 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
#> # ... with 2.611e+04 more rows, and 5 more variables: wind_gust <dbl>,
#> #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>

以上数据集之间的关系为:

R for Data Science之——Relational data
这其中planestailnumflightstailnum就是一个主键,而flightstailnum则为一个外键,对于主键来说,其在数据集中都是单一的:

planes %>% 
  count(tailnum) %>% 
  filter(n > 1)
#> # A tibble: 0 x 2
#> # ... with 2 variables: tailnum <chr>, n <int>

weather %>% 
  count(year, month, day, hour, origin) %>% 
  filter(n > 1)
#> # A tibble: 3 x 6
#>    year month   day  hour origin     n
#>   <dbl> <dbl> <int> <int> <chr>  <int>
#> 1  2013    11     3     1 EWR        2
#> 2  2013    11     3     1 JFK        2
#> 3  2013    11     3     1 LGA        2

但是flights数据集的主键是什么呢:

flights %>% 
  count(year, month, day, flight) %>% 
  filter(n > 1)
#> # A tibble: 29,768 x 5
#>    year month   day flight     n
#>   <int> <int> <int>  <int> <int>
#> 1  2013     1     1      1     2
#> 2  2013     1     1      3     2
#> 3  2013     1     1      4     2
#> 4  2013     1     1     11     3
#> 5  2013     1     1     15     2
#> 6  2013     1     1     21     2
#> # ... with 2.976e+04 more rows

flights %>% 
  count(year, month, day, tailnum) %>% 
  filter(n > 1)
#> # A tibble: 64,928 x 5
#>    year month   day tailnum     n
#>   <int> <int> <int> <chr>   <int>
#> 1  2013     1     1 N0EGMQ      2
#> 2  2013     1     1 N11189      2
#> 3  2013     1     1 N11536      2
#> 4  2013     1     1 N11544      3
#> 5  2013     1     1 N11551      2
#> 6  2013     1     1 N12540      2
#> # ... with 6.492e+04 more rows

可见这都不是主键,在这种情况下我们通常用mutate()和row_number()结合生成新的一列作为主键,这被称为surrogate key 代理主键。

Mutating joins

flights2 <- flights %>% 
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2
#> # A tibble: 336,776 x 8
#>    year month   day  hour origin dest  tailnum carrier
#>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>  
#> 1  2013     1     1     5 EWR    IAH   N14228  UA     
#> 2  2013     1     1     5 LGA    IAH   N24211  UA     
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA     
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6     
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL     
#> 6  2013     1     1     5 EWR    ORD   N39463  UA     
#> # ... with 3.368e+05 more rows

flights2 %>%
  select(-origin, -dest) %>% 
  left_join(airlines, by = "carrier")
#> # A tibble: 336,776 x 7
#>    year month   day  hour tailnum carrier name                  
#>   <int> <int> <int> <dbl> <chr>   <chr>   <chr>                 
#> 1  2013     1     1     5 N14228  UA      United Air Lines Inc. 
#> 2  2013     1     1     5 N24211  UA      United Air Lines Inc. 
#> 3  2013     1     1     5 N619AA  AA      American Airlines Inc.
#> 4  2013     1     1     5 N804JB  B6      JetBlue Airways       
#> 5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.  
#> 6  2013     1     1     5 N39463  UA      United Air Lines Inc. 
#> # ... with 3.368e+05 more rows

flights2 %>%
  select(-origin, -dest) %>% 
  mutate(name = airlines$name[match(carrier, airlines$carrier)])
#> # A tibble: 336,776 x 7
#>    year month   day  hour tailnum carrier name                  
#>   <int> <int> <int> <dbl> <chr>   <chr>   <chr>                 
#> 1  2013     1     1     5 N14228  UA      United Air Lines Inc. 
#> 2  2013     1     1     5 N24211  UA      United Air Lines Inc. 
#> 3  2013     1     1     5 N619AA  AA      American Airlines Inc.
#> 4  2013     1     1     5 N804JB  B6      JetBlue Airways       
#> 5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.  
#> 6  2013     1     1     5 N39463  UA      United Air Lines Inc. 
#> # ... with 3.368e+05 more rows

Inner join

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
)

x %>% 
  inner_join(y, by = "key")
#> # A tibble: 2 x 3
#>     key val_x val_y
#>   <dbl> <chr> <chr>
#> 1     1 x1    y1   
#> 2     2 x2    y2

left join

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     1, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2"
)
left_join(x, y, by = "key")
#> # A tibble: 4 x 3
#>     key val_x val_y
#>   <dbl> <chr> <chr>
#> 1     1 x1    y1   
#> 2     2 x2    y2   
#> 3     2 x3    y2   
#> 4     1 x4    y1

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     3, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     2, "y3",
     3, "y4"
)
left_join(x, y, by = "key")
#> # A tibble: 6 x 3
#>     key val_x val_y
#>   <dbl> <chr> <chr>
#> 1     1 x1    y1   
#> 2     2 x2    y2   
#> 3     2 x2    y3   
#> 4     2 x3    y2   
#> 5     2 x3    y3   
#> 6     3 x4    y4

如果省略by关键词,会根据所有共同的变量进行整合:

flights2 %>% 
  left_join(weather)
#> Joining, by = c("year", "month", "day", "hour", "origin")
#> # A tibble: 336,776 x 18
#>    year month   day  hour origin dest  tailnum carrier  temp  dewp humid
#>   <dbl> <dbl> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
#> 1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4
#> 2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8
#> 6  2013     1     1     5 EWR    ORD   N39463  UA       39.0  28.0  64.4
#> # ... with 3.368e+05 more rows, and 7 more variables: wind_dir <dbl>,
#> #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
#> #   visib <dbl>, time_hour <dttm>

flights2 %>% 
  left_join(planes, by = "tailnum")
#> # A tibble: 336,776 x 16
#>   year.x month   day  hour origin dest  tailnum carrier year.y type 
#>    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>
#> 1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixe…
#> 2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixe…
#> 3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixe…
#> 4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixe…
#> 5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixe…
#> 6   2013     1     1     5 EWR    ORD   N39463  UA        2012 Fixe…
#> # ... with 3.368e+05 more rows, and 6 more variables: manufacturer <chr>,
#> #   model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>

flights2 %>% 
  left_join(airports, c("dest" = "faa"))
#> # A tibble: 336,776 x 15
#>    year month   day  hour origin dest  tailnum carrier name    lat   lon
#>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr> <dbl> <dbl>
#> 1  2013     1     1     5 EWR    IAH   N14228  UA      Geor…  30.0 -95.3
#> 2  2013     1     1     5 LGA    IAH   N24211  UA      Geor…  30.0 -95.3
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA      Miam…  25.8 -80.3
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6      <NA>   NA    NA  
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL      Hart…  33.6 -84.4
#> 6  2013     1     1     5 EWR    ORD   N39463  UA      Chic…  42.0 -87.9
#> # ... with 3.368e+05 more rows, and 4 more variables: alt <int>, tz <dbl>,
#> #   dst <chr>, tzone <chr>

flights2 %>% 
  left_join(airports, c("origin" = "faa"))
#> # A tibble: 336,776 x 15
#>    year month   day  hour origin dest  tailnum carrier name    lat   lon
#>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr> <dbl> <dbl>
#> 1  2013     1     1     5 EWR    IAH   N14228  UA      Newa…  40.7 -74.2
#> 2  2013     1     1     5 LGA    IAH   N24211  UA      La G…  40.8 -73.9
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA      John…  40.6 -73.8
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6      John…  40.6 -73.8
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL      La G…  40.8 -73.9
#> 6  2013     1     1     5 EWR    ORD   N39463  UA      Newa…  40.7 -74.2
#> # ... with 3.368e+05 more rows, and 4 more variables: alt <int>, tz <dbl>,
#> #   dst <chr>, tzone <chr>

Filtering joins

  • semi_join(x, y)保存所有x中与y匹配的观测值
  • anti_join(x, y)抛弃所有x中与y匹配的观测值
top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10)
top_dest
#> # A tibble: 10 x 2
#>   dest      n
#>   <chr> <int>
#> 1 ORD   17283
#> 2 ATL   17215
#> 3 LAX   16174
#> 4 BOS   15508
#> 5 MCO   14082
#> 6 CLT   14064
#> # ... with 4 more rows

flights %>% 
  filter(dest %in% top_dest$dest)
#> # A tibble: 141,145 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013     1     1      542            540         2      923
#> 2  2013     1     1      554            600        -6      812
#> 3  2013     1     1      554            558        -4      740
#> 4  2013     1     1      555            600        -5      913
#> 5  2013     1     1      557            600        -3      838
#> 6  2013     1     1      558            600        -2      753
#> # ... with 1.411e+05 more rows, and 12 more variables:
#> #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
#> #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#> #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

flights %>% 
  semi_join(top_dest)
#> Joining, by = "dest"
#> # A tibble: 141,145 x 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>
#> 1  2013     1     1      542            540         2      923
#> 2  2013     1     1      554            600        -6      812
#> 3  2013     1     1      554            558        -4      740
#> 4  2013     1     1      555            600        -5      913
#> 5  2013     1     1      557            600        -3      838
#> 6  2013     1     1      558            600        -2      753
#> # ... with 1.411e+05 more rows, and 12 more variables:
#> #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
#> #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#> #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
  anti_join(planes, by = "tailnum") %>%
  count(tailnum, sort = TRUE)
#> # A tibble: 722 x 2
#>   tailnum     n
#>   <chr>   <int>
#> 1 <NA>     2512
#> 2 N725MQ    575
#> 3 N722MQ    513
#> 4 N723MQ    507
#> 5 N713MQ    483
#> 6 N735MQ    396
#> # ... with 716 more rows

Set operations

  • intersect(x, y):返回x和y*有的值
  • union(x, y):返回x和y所有的值
  • setdiff(x, y):返回x有但y没有的值
df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
)
df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
)

intersect(df1, df2)
#> # A tibble: 1 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     1

# Note that we get 3 rows, not 4
union(df1, df2)
#> # A tibble: 3 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     2
#> 2     2     1
#> 3     1     1

setdiff(df1, df2)
#> # A tibble: 1 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     2     1

setdiff(df2, df1)
#> # A tibble: 1 x 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     2

所有代码已上传GITHUB点此进入