What the function does (Overview)
`generate_fake_with_privacy()` creates a synthetic copy of your data.
It then handles sensitive columns by name.
Level presets
level | category_mode | column_mode | numeric_mode |
---|---|---|---|
low | preserve | keep | range |
medium | generic | generic | range |
high | generic | generic | distribution |
- `sensitive_detect` auto-finds common PII by column name.
- `sensitive_strategy` chooses how to treat those columns: `"fake"` (tokenize) or `"drop"` (remove).
- You can also list sensitive columns yourself with `sensitive = c("id", "email", ...)`.
Levels and strategies
library(FakeDataR)
# Example data: three PII-like columns (id, email, phone) plus one
# categorical (dept) and one numeric (spend) column.
# The RNG is seeded first so that `dept` and `spend` are identical on every
# run; without a seed, sample()/runif() produce different data each time the
# example is executed.
set.seed(1)
df <- data.frame(
  id = 1:50,
  email = sprintf("u%02d@x.com", 1:50),
  phone = sprintf("555-01%02d", 1:50),
  dept = sample(c("A", "B", "C"), size = 50, replace = TRUE),
  spend = round(runif(50, min = 10, max = 200), 2),
  check.names = FALSE
)
# "low" level: sensitive columns are auto-detected by name and replaced
# with fake values (the "fake" strategy, which is the default).
fake_low <- generate_fake_with_privacy(
  data = df,
  n = 60,
  level = "low",
  seed = 1,
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# "medium" level: sensitive columns are auto-detected by name and removed
# from the result (the "drop" strategy).
fake_drop <- generate_fake_with_privacy(
  data = df,
  n = 60,
  level = "medium",
  seed = 1,
  sensitive_detect = TRUE,
  sensitive_strategy = "drop",
  normalize = TRUE
)

# Column names of each synthetic data set
names(fake_low)
#> [1] "id" "email" "phone" "dept" "spend"
names(fake_drop)
#> [1] "var4" "var5"

# Inspect privacy metadata stored as attributes on the results
attr(fake_low, "sensitive_columns")
#> [1] "id" "email" "phone"
attr(fake_drop, "dropped_columns")
#> [1] "id" "email" "phone"
attr(fake_low, "name_map")
#> id email phone dept spend
#> "id" "email" "phone" "dept" "spend"
Explicit ‘sensitive’ vs auto-detect
You can fully control what’s sensitive. Here we turn off auto-detect and list columns ourselves:
# Auto-detection is switched off; only the columns listed in `sensitive`
# are treated as sensitive and replaced with fake values.
fake_explicit <- generate_fake_with_privacy(
  data = df,
  n = 60,
  seed = 1,
  sensitive = c("id", "email", "phone"),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# Column names and the columns recorded as sensitive
names(fake_explicit)
#> [1] "id" "email" "phone" "dept" "spend"
attr(fake_explicit, "sensitive_columns")
#> [1] "id" "email" "phone"
Extending detection with your own patterns
# A broad, configurable set of column-name patterns.
# Entries are joined with "|" and matched case-insensitively (see
# `ignore.case = TRUE` below). Most entries are unanchored substrings, so
# short tokens such as "ip", "pin" or "state" can also hit unrelated column
# names -- tighten them for your own data.
sensitive_patterns <- c(
  # direct IDs / names
  "^id$", "employee[_-]?id", "user(name|[_-]?id)?$",
  "full[_-]?name", "first[_-]?name", "last[_-]?name",
  # contact
  "email|e-mail", "phone|tel|mobile", "fax",
  # address / geo
  "address|street|road|avenue|apt|unit|suite",
  "zip|postal|postcode|city|state|province|country",
  "lat(itude)?|lon(gitude)?|gps",
  # government IDs (international sampling)
  # "ced(ul)?a" covers "cedula"/"ceda"; "nin" already covers "nin\\b".
  "RegId|ssn|sin|nin|aadhaar|aadhar|bvn|curp|dni|ced(ul)?a|cpf",
  "pan\\b|tin\\b|ein\\b|pesel",
  # licenses / travel docs
  "passport|visa|license|licence|driver|dl\\b|vin|plate",
  # finance / payments
  "iban|swift|bic|routing|sort[_-]?code|account|acct|bank",
  "credit|debit|card|cvv|cvc|pan[_-]?number",
  # auth / secrets / device
  "password|pass|pwd|pin|otp|secret|token|api[_-]?key",
  "auth|bearer|session|cookie",
  "ip(_address)?|mac(_address)?|imei|imsi|serial|device|udid",
  "android[_-]?id|idfa|gaid",
  # medical / patient
  "mrn|nhs|medicare|medicaid|patient|diagnosis",
  # birthdays
  "dob|date[_-]?of[_-]?birth|birth(day|date)",
  # education
  "student[_-]?id"
)

# Collapse all patterns into one alternation. Case-insensitivity is requested
# through grep()'s documented `ignore.case` argument rather than an inline
# "(?i)" flag, so it does not depend on which regex engine is in use.
rx <- paste(sensitive_patterns, collapse = "|")
sens_cols <- grep(rx, names(df), ignore.case = TRUE, value = TRUE)
sens_cols
#> [1] "id" "email" "phone"

# Pass the detected columns explicitly (auto-detect off). Extra columns can be
# appended by hand; unique() drops any that were already detected.
fake_custom_detect <- generate_fake_with_privacy(
  data = df,
  n = 60,
  seed = 1,
  sensitive = unique(c(sens_cols, "email")),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)
attr(fake_custom_detect, "sensitive_columns")
#> [1] "id" "email" "phone"
Validation
# Compare the original data (`df`) with the synthetic copy (`fake_low`).
# As the printed report below shows, the result has one row per column with
# class, NA-proportion, blank-proportion and range checks.
v1 <- validate_fake(df, fake_low)
# Show the first five rows of the report (here: one per column of `df`).
head(v1, 5)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 id integer integer TRUE 0 0
#> 2 email character character TRUE 0 0
#> 3 phone character character TRUE 0 0
#> 4 dept character character TRUE 0 0
#> 5 spend numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE NA NA NA
#> 2 TRUE 0 0 TRUE
#> 3 TRUE 0 0 TRUE
#> 4 TRUE 0 0 TRUE
#> 5 TRUE NA NA NA
#> range_within_original
#> 1 FALSE
#> 2 NA
#> 3 NA
#> 4 NA
#> 5 TRUE
# NOTE(review): `id` reports range_within_original = FALSE -- presumably
# because the faked `id` values fall outside the original 1..50 range; confirm
# against validate_fake()'s documentation.