-
Notifications
You must be signed in to change notification settings - Fork 0
/
db_23_solutions.R
75 lines (59 loc) · 1.72 KB
/
db_23_solutions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# 1. Write code to create a partitioned dataset with the `flights` table,
# partitioned by `origin`.
# - Hint: The dataset only contains flights departing from New York City airports.
con_rw <- DBI::dbConnect(
duckdb::duckdb(),
dbdir = "flights.duckdb",
read_only = FALSE
)
DBI::dbExecute(con_rw, "DROP TABLE IF EXISTS flights;")
flights_duckdb <- dplyr::copy_to(
con_rw,
nycflights13::flights,
name = "flights",
temporary = FALSE,
overwrite = TRUE
)
dplyr::tbl(con_rw, "flights") |>
dplyr::distinct(origin)
# Method 1 ---------------------------------------------------------------------
# DB-agnositic
ewr <-
dplyr::tbl(con_rw, "flights") |>
dplyr::filter(origin == "EWR") |>
dplyr::collect()
lga <-
dplyr::tbl(con_rw, "flights") |>
dplyr::filter(origin == "LGA") |>
dplyr::collect()
jfk <-
dplyr::tbl(con_rw, "flights") |>
dplyr::filter(origin == "JFK") |>
dplyr::collect()
purrr::walk2(
list(ewr, lga, jfk),
list("EWR", "LGA", "JFK"),
function(x, y) {
if (!fs::dir_exists("manual-partition-flights")) {
fs::dir_create("manual-partition-flights")
}
out_path <- fs::dir_create(
fs::path("manual-partition-flights", paste0("origin=", y))
)
duckplyr::df_to_parquet(
x,
fs::path(out_path, "part-0.parquet")
)
}
)
# Method 2 ---------------------------------------------------------------------
# If on DuckDB
DBI::dbExecute(
con_rw,
"COPY flights TO 'flights-partion' (FORMAT PARQUET, PARTITION_BY origin);"
)
# Method 3 ---------------------------------------------------------------------
# If on DuckDB
dplyr::tbl(con_rw, "flights") |>
arrow::to_arrow() |>
arrow::write_dataset("flights-partition-arrow", partitioning = "origin")