Aggregating Recruitment Data by Time Period • c4cp

This vignette demonstrates how to use the count_by_period function to aggregate data by specified time periods (week, month, or quarter) using a BigQuery dataset connected via dbplyr.

Prerequisites

Ensure you have the following packages installed and loaded:

install.packages(c("dplyr", "dbplyr", "bigrquery", "DBI", "rlang", "devtools"))
devtools::install_github("Analyticsphere/c4cp")

# Load the necessary libraries:
library(dplyr)
library(dbplyr)
library(bigrquery)
library(DBI)
library(c4cp)

Establishing a Connection to BigQuery

First, establish a connection to your BigQuery project and dataset.

# Define your project and dataset IDs
project_id <- "nih-nci-dceg-connect-stg-5519"
dataset_id <- "FlatConnect"

# Establish the connection
con <- dbConnect(
  bigrquery::bigquery(),
  project = project_id,
  dataset = dataset_id,
  use_legacy_sql = FALSE
)

# Reference the 'participants_JP' table in the 'FlatConnect' dataset
your_table <- tbl(con, "FlatConnect.participants_JP")

Aggregating Counts by Week

Aggregate the number of records by week, with the week starting on Monday (default setting).

# Aggregate counts by week
weekly_counts <- count_by_period(
  tbl = your_table,
  date_col = d_471593703,
  period = "week",
  date_type = "string"
)

# Collect the results into R
weekly_counts_local <- weekly_counts %>% collect()

# View the aggregated weekly counts
print(weekly_counts_local)

Example Output:

# A tibble: 109 × 2
   period_start        num_records
   <dttm>                    <int>
 1 2022-05-30 00:00:00           8
 2 2022-06-06 00:00:00          15
 3 2022-06-13 00:00:00          31
 4 2022-06-20 00:00:00           7
 5 2022-06-27 00:00:00          14
 6 2022-07-04 00:00:00          30
 7 2022-07-11 00:00:00          26
 8 2022-07-18 00:00:00          10
 9 2022-07-25 00:00:00          12
10 2022-08-01 00:00:00           2
# ℹ 99 more rows
# ℹ Use `print(n = ...)` to see more rows

Show the SQL Query that was Generated

weekly_counts %>% show_query()

<SQL>
SELECT `period_start`, count(*) AS `num_records`
FROM (
  SELECT
    `d_471593703`,
    TIMESTAMP_TRUNC(CAST(`d_471593703` AS TIMESTAMP), WEEK(MONDAY)) AS `period_start`
  FROM `FlatConnect.participants_JP`
  WHERE (NOT((`d_471593703` IS NULL)))
) `q01`
GROUP BY `period_start`
ORDER BY `period_start`

Aggregating Counts by Month

Aggregate the number of records by month.

# Aggregate counts by month
monthly_counts <- count_by_period(
  tbl = your_table,
  date_col = d_471593703,
  period = "month"
)

# Collect the results into R
monthly_counts_local <- monthly_counts %>% collect()

# View the aggregated monthly counts
print(monthly_counts_local)

Example Output:

# A tibble: 29 × 2
   period_start        num_records
   <dttm>                    <int>
 1 2022-06-01 00:00:00          70
 2 2022-07-01 00:00:00          83
 3 2022-08-01 00:00:00          29
 4 2022-09-01 00:00:00          48
 5 2022-10-01 00:00:00          65
 6 2022-11-01 00:00:00         171
 7 2022-12-01 00:00:00         192
 8 2023-01-01 00:00:00          20
 9 2023-02-01 00:00:00          31
10 2023-03-01 00:00:00          93
# ℹ 19 more rows
# ℹ Use `print(n = ...)` to see more rows

Aggregating Counts by Quarter

Aggregate the number of records by quarter.

# Aggregate counts by quarter
quarterly_counts <- count_by_period(
  tbl = your_table,
  date_col = d_471593703,
  period = "quarter"
)

# Collect the results into R
quarterly_counts_local <- quarterly_counts %>% collect()

# View the aggregated quarterly counts
print(quarterly_counts_local)

Example Output:

# A tibble: 11 × 2
   period_start        num_records
   <dttm>                    <int>
 1 2022-04-01 00:00:00          70
 2 2022-07-01 00:00:00         160
 3 2022-10-01 00:00:00         428
 4 2023-01-01 00:00:00         144
 5 2023-04-01 00:00:00         141
 6 2023-07-01 00:00:00         114
 7 2023-10-01 00:00:00         191
 8 2024-01-01 00:00:00         172
 9 2024-04-01 00:00:00         115
10 2024-07-01 00:00:00         121
11 2024-10-01 00:00:00           2

Aggregating Counts by Week Starting on Sunday

Customize the aggregation to have weeks start on Sunday instead of the default Monday.

# Aggregate counts by week starting on Sunday
weekly_sunday_counts <- count_by_period(
  tbl = your_table,
  date_col = d_471593703,
  period = "week",
  week_start = "SUNDAY"
)

# Collect the results into R
weekly_sunday_counts_local <- weekly_sunday_counts %>% collect()

# View the aggregated weekly counts starting on Sunday
print(weekly_sunday_counts_local)

Example Output

# A tibble: 109 × 2
   period_start        num_records
   <dttm>                    <int>
 1 2022-05-29 00:00:00           8
 2 2022-06-05 00:00:00          15
 3 2022-06-12 00:00:00          31
 4 2022-06-19 00:00:00           7
 5 2022-06-26 00:00:00          14
 6 2022-07-03 00:00:00          30
 7 2022-07-10 00:00:00          26
 8 2022-07-17 00:00:00          10
 9 2022-07-24 00:00:00          12
10 2022-07-31 00:00:00           2
# ℹ 99 more rows
# ℹ Use `print(n = ...)` to see more rows

Aggregate counts by week starting on Sunday, grouping by site and birth month

 weekly_counts_by_site_birth_month <- your_table %>%
     mutate(
       RcrtSI_TypeTime_v1r0 = d_471593703,
       birth_month = d_564964481,
       site = case_when(
        d_827220437 == "531629870" ~ "HP",
        d_827220437 == "548392715" ~ "HF",
        d_827220437 == "125001209" ~ "KPCO",
        d_827220437 == "327912200" ~ "KPGA",
        d_827220437 == "300267574" ~ "KPHI",
        d_827220437 == "452412599" ~ "KPNW",
        d_827220437 == "303349821" ~ "MARSH",
        d_827220437 == "657167265" ~ "SANF",
        d_827220437 == "809703864" ~ "UCHI",
        d_827220437 == "472940358" ~ "BSWH",
        TRUE ~ "Other" 
    )) %>%
    count_by_period(
       date_col = "RcrtSI_TypeTime_v1r0",
       period = "week",
       week_start = "SUNDAY",
       date_type = "string",
       group_vars = c("site", "birth_month")
     ) %>% 
     show_query() %>%
     collect() %>%
     print()

Example Output

<SQL>
SELECT `period_start`, `site`, `birth_month`, count(*) AS `num_records`
FROM (
  SELECT
    `q01`.*,
    TIMESTAMP_TRUNC(CAST(`RcrtSI_TypeTime_v1r0` AS TIMESTAMP), WEEK(SUNDAY)) AS `period_start`
  FROM (
    SELECT
      `d_471593703` AS `RcrtSI_TypeTime_v1r0`,
      CASE
        WHEN (`d_827220437` = '531629870') THEN 'HP'
        WHEN (`d_827220437` = '548392715') THEN 'HF'
        WHEN (`d_827220437` = '125001209') THEN 'KPCO'
        WHEN (`d_827220437` = '327912200') THEN 'KPGA'
        WHEN (`d_827220437` = '300267574') THEN 'KPHI'
        WHEN (`d_827220437` = '452412599') THEN 'KPNW'
        WHEN (`d_827220437` = '303349821') THEN 'MARSH'
        WHEN (`d_827220437` = '657167265') THEN 'SANF'
        WHEN (`d_827220437` = '809703864') THEN 'UCHI'
        WHEN (`d_827220437` = '472940358') THEN 'BSWH'
        ELSE 'Other'
      END AS `site`,
      `d_564964481` AS `birth_month`
    FROM `FlatConnect.participants_JP`
    WHERE (NOT((`d_471593703` IS NULL)))
  ) `q01`
) `q01`
GROUP BY `period_start`, `site`, `birth_month`
ORDER BY `period_start`
Job complete
Billed: 10.49 MB
Downloading first chunk of data.
First chunk includes all requested rows.
# A tibble: 747 × 4
   period_start        site  birth_month num_records
   <dttm>              <chr> <chr>             <int>
 1 2022-05-29 00:00:00 HP    03                    1
 2 2022-05-29 00:00:00 MARSH 05                    1
 3 2022-05-29 00:00:00 MARSH 01                    2
 4 2022-05-29 00:00:00 HP    11                    1
 5 2022-05-29 00:00:00 MARSH 08                    1
 6 2022-05-29 00:00:00 HP    04                    1
 7 2022-05-29 00:00:00 HP    10                    1
 8 2022-06-05 00:00:00 SANF  NA                    1
 9 2022-06-05 00:00:00 SANF  01                    2
10 2022-06-05 00:00:00 HF    07                    1
# ℹ 737 more rows
# ℹ Use `print(n = ...)` to see more rows