Aggregating Recruitment Data by Time Period
Source:vignettes/count_by_period_vignette.Rmd
count_by_period_vignette.Rmd
This vignette demonstrates how to use the
count_by_period
function to aggregate data by specified
time periods (week, month, or quarter) using a BigQuery dataset
connected via dbplyr.
Establishing a Connection to BigQuery
First, establish a connection to your BigQuery project and dataset.
# Define your project and dataset IDs
project_id <- "nih-nci-dceg-connect-stg-5519"
dataset_id <- "FlatConnect"
# Establish the connection
con <- dbConnect(
bigrquery::bigquery(),
project = project_id,
dataset = dataset_id,
use_legacy_sql = FALSE
)
# Reference the 'participants_JP' table in the 'FlatConnect' dataset
your_table <- tbl(con, "FlatConnect.participants_JP")
Aggregating Counts by Week
Aggregate the number of records by week, with the week starting on Monday (default setting).
# Aggregate counts by week
weekly_counts <- count_by_period(
tbl = your_table,
date_col = d_471593703,
period = "week",
date_type = "string"
)
# Collect the results into R
weekly_counts_local <- weekly_counts %>% collect()
# View the aggregated weekly counts
print(weekly_counts_local)
Example Output:
# A tibble: 109 × 2
period_start num_records
<dttm> <int>
1 2022-05-30 00:00:00 8
2 2022-06-06 00:00:00 15
3 2022-06-13 00:00:00 31
4 2022-06-20 00:00:00 7
5 2022-06-27 00:00:00 14
6 2022-07-04 00:00:00 30
7 2022-07-11 00:00:00 26
8 2022-07-18 00:00:00 10
9 2022-07-25 00:00:00 12
10 2022-08-01 00:00:00 2
# ℹ 99 more rows
# ℹ Use `print(n = ...)` to see more rows
Aggregating Counts by Month
Aggregate the number of records by month.
# Aggregate counts by month
monthly_counts <- count_by_period(
tbl = your_table,
date_col = d_471593703,
period = "month"
)
# Collect the results into R
monthly_counts_local <- monthly_counts %>% collect()
# View the aggregated monthly counts
print(monthly_counts_local)
Example Output:
# A tibble: 29 × 2
period_start num_records
<dttm> <int>
1 2022-06-01 00:00:00 70
2 2022-07-01 00:00:00 83
3 2022-08-01 00:00:00 29
4 2022-09-01 00:00:00 48
5 2022-10-01 00:00:00 65
6 2022-11-01 00:00:00 171
7 2022-12-01 00:00:00 192
8 2023-01-01 00:00:00 20
9 2023-02-01 00:00:00 31
10 2023-03-01 00:00:00 93
# ℹ 19 more rows
# ℹ Use `print(n = ...)` to see more rows
Aggregating Counts by Quarter
Aggregate the number of records by quarter.
# Aggregate counts by quarter
quarterly_counts <- count_by_period(
tbl = your_table,
date_col = d_471593703,
period = "quarter"
)
# Collect the results into R
quarterly_counts_local <- quarterly_counts %>% collect()
# View the aggregated quarterly counts
print(quarterly_counts_local)
Example Output:
# A tibble: 11 × 2
period_start num_records
<dttm> <int>
1 2022-04-01 00:00:00 70
2 2022-07-01 00:00:00 160
3 2022-10-01 00:00:00 428
4 2023-01-01 00:00:00 144
5 2023-04-01 00:00:00 141
6 2023-07-01 00:00:00 114
7 2023-10-01 00:00:00 191
8 2024-01-01 00:00:00 172
9 2024-04-01 00:00:00 115
10 2024-07-01 00:00:00 121
11 2024-10-01 00:00:00 2
Aggregating Counts by Week Starting on Sunday
Customize the aggregation to have weeks start on Sunday instead of the default Monday.
# Aggregate counts by week starting on Sunday
weekly_sunday_counts <- count_by_period(
tbl = your_table,
date_col = d_471593703,
period = "week",
week_start = "SUNDAY"
)
# Collect the results into R
weekly_sunday_counts_local <- weekly_sunday_counts %>% collect()
# View the aggregated weekly counts starting on Sunday
print(weekly_sunday_counts_local)
Example Output
# A tibble: 109 × 2
period_start num_records
<dttm> <int>
1 2022-05-29 00:00:00 8
2 2022-06-05 00:00:00 15
3 2022-06-12 00:00:00 31
4 2022-06-19 00:00:00 7
5 2022-06-26 00:00:00 14
6 2022-07-03 00:00:00 30
7 2022-07-10 00:00:00 26
8 2022-07-17 00:00:00 10
9 2022-07-24 00:00:00 12
10 2022-07-31 00:00:00 2
# ℹ 99 more rows
# ℹ Use `print(n = ...)` to see more rows
Aggregate counts by week starting on Sunday, grouping by site and birth month
weekly_counts_by_site_birth_month <- your_table %>%
mutate(
RcrtSI_TypeTime_v1r0 = d_471593703,
birth_month = d_564964481,
site = case_when(
d_827220437 == "531629870" ~ "HP",
d_827220437 == "548392715" ~ "HF",
d_827220437 == "125001209" ~ "KPCO",
d_827220437 == "327912200" ~ "KPGA",
d_827220437 == "300267574" ~ "KPHI",
d_827220437 == "452412599" ~ "KPNW",
d_827220437 == "303349821" ~ "MARSH",
d_827220437 == "657167265" ~ "SANF",
d_827220437 == "809703864" ~ "UCHI",
d_827220437 == "472940358" ~ "BSWH",
TRUE ~ "Other"
)) %>%
count_by_period(
date_col = "RcrtSI_TypeTime_v1r0",
period = "week",
week_start = "SUNDAY",
date_type = "string",
group_vars = c("site", "birth_month")
) %>%
show_query() %>%
collect() %>%
print()
Example Output
<SQL>
SELECT `period_start`, `site`, `birth_month`, count(*) AS `num_records`
FROM (
SELECT
`q01`.*,
TIMESTAMP_TRUNC(CAST(`RcrtSI_TypeTime_v1r0` AS TIMESTAMP), WEEK(SUNDAY)) AS `period_start`
FROM (
SELECT
`d_471593703` AS `RcrtSI_TypeTime_v1r0`,
CASE
WHEN (`d_827220437` = '531629870') THEN 'HP'
WHEN (`d_827220437` = '548392715') THEN 'HF'
WHEN (`d_827220437` = '125001209') THEN 'KPCO'
WHEN (`d_827220437` = '327912200') THEN 'KPGA'
WHEN (`d_827220437` = '300267574') THEN 'KPHI'
WHEN (`d_827220437` = '452412599') THEN 'KPNW'
WHEN (`d_827220437` = '303349821') THEN 'MARSH'
WHEN (`d_827220437` = '657167265') THEN 'SANF'
WHEN (`d_827220437` = '809703864') THEN 'UCHI'
WHEN (`d_827220437` = '472940358') THEN 'BSWH'
ELSE 'Other'
END AS `site`,
`d_564964481` AS `birth_month`
FROM `FlatConnect.participants_JP`
WHERE (NOT((`d_471593703` IS NULL)))
) `q01`
) `q01`
GROUP BY `period_start`, `site`, `birth_month`
ORDER BY `period_start`
Job complete
Billed: 10.49 MB
Downloading first chunk of data.
First chunk includes all requested rows.
# A tibble: 747 × 4
period_start site birth_month num_records
<dttm> <chr> <chr> <int>
1 2022-05-29 00:00:00 HP 03 1
2 2022-05-29 00:00:00 MARSH 05 1
3 2022-05-29 00:00:00 MARSH 01 2
4 2022-05-29 00:00:00 HP 11 1
5 2022-05-29 00:00:00 MARSH 08 1
6 2022-05-29 00:00:00 HP 04 1
7 2022-05-29 00:00:00 HP 10 1
8 2022-06-05 00:00:00 SANF NA 1
9 2022-06-05 00:00:00 SANF 01 2
10 2022-06-05 00:00:00 HF 07 1
# ℹ 737 more rows
# ℹ Use `print(n = ...)` to see more rows