updated script logic and generated sample data

This commit is contained in:
CameronCSS 2025-12-09 16:37:53 -07:00
parent 5e911b4690
commit 125aa5e122
3 changed files with 338 additions and 0 deletions

View file

@@ -0,0 +1,114 @@
from faker import Faker
from dotenv import load_dotenv
from datetime import datetime
import os
import random
import pandas as pd
import boto3
# ---- Setup ----
fake = Faker()
load_dotenv()
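# NOTE: load_dotenv() reads a local .env file; this script expects it to define
# STORAGE_ENDPOINT, STORAGE_ACCESS_KEY, STORAGE_SECRET_KEY, and STORAGE_BUCKET.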
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)
bucket_name = os.getenv("STORAGE_BUCKET")
customers_key = "DataLab/customers/customers.csv"
accounts_s3_key = "DataLab/accounts/accounts.csv"
# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)
# ---- Download customers.csv from S3 ----
local_customers_file = "../Data/customers.csv"
try:
    s3.Bucket(bucket_name).download_file(customers_key, local_customers_file)
    print("Downloaded customers.csv from S3.")
except Exception as e:
    print("ERROR: Could not download customers.csv:", e)
    raise SystemExit(1)
# ---- Load customers DataFrame ----
customers_df = pd.read_csv(local_customers_file)
# Convert customer_since to actual date objects
customers_df["customer_since"] = pd.to_datetime(customers_df["customer_since"]).dt.date
# ---- Helper Functions ----
def generate_account_id(branch_id):
    """Generate realistic branch-coded 12-digit account IDs."""
    branch_part = str(branch_id).zfill(3)  # 3-digit branch ID
    random_part = str(random.randint(10**8, 10**9 - 1))  # 9 random digits
    return branch_part + random_part

def generate_account_number():
    """Generate realistic 11-digit bank account numbers."""
    return str(random.randint(10**10, (10**11) - 1))

def assign_account_types():
    """
    Assign 1-2 accounts per customer using realistic rules:
    - ~50% Checking Only
    - ~20% Savings Only
    - ~30% Both
    """
    roll = random.random()
    if roll < 0.50:
        return ["Checking"]
    elif roll < 0.70:
        return ["Savings"]
    else:
        return ["Checking", "Savings"]

def balance_for_type(account_type):
    """Give realistic account balances."""
    if account_type == "Checking":
        return round(random.uniform(50, 7000), 2)
    else:  # Savings
        return round(random.uniform(200, 25000), 2)
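# Illustrative example (values assumed): generate_account_id(7) could return
# "007" + "483920166" -> "007483920166", i.e. a 3-digit branch code followed
# by 9 random digits.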
# ---- Generate accounts ----
accounts = []
for _, row in customers_df.iterrows():
    customer_id = row["customer_id"]
    customer_since = row["customer_since"]
    home_branch_id = row["home_branch_id"]
    # Determine which account types this customer owns
    account_types = assign_account_types()
    for acct_type in account_types:
        accounts.append({
            "account_id": generate_account_id(home_branch_id),
            "account_number": generate_account_number(),
            "customer_id": customer_id,
            "account_type": acct_type,
            "open_date": fake.date_between(
                start_date=customer_since, end_date=datetime.today().date()
            ),
            "balance": balance_for_type(acct_type),
            "branch_id": home_branch_id
        })
# ---- Convert to DataFrame ----
accounts_df = pd.DataFrame(accounts)
# ---- Save locally ----
local_accounts_file = "../Data/accounts.csv"
accounts_df.to_csv(local_accounts_file, index=False)
print("Generated accounts.csv locally.")
# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_accounts_file, accounts_s3_key)
    print(f"Uploaded accounts.csv to s3://{bucket_name}/{accounts_s3_key}")
except Exception as e:
    print("ERROR: Could not upload accounts.csv to S3:", e)

View file

@@ -0,0 +1,109 @@
from faker import Faker
from dotenv import load_dotenv
import os
import pandas as pd
import boto3
import random
from datetime import datetime
fake = Faker()
# ---- Load env ----
load_dotenv()
# ---- Hetzner S3 setup ----
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)
bucket_name = os.getenv("STORAGE_BUCKET")
customers_s3_key = "DataLab/customers/customers.csv"
branches_s3_key = "DataLab/branches/branches.csv"
# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)
# ---- Load branches from S3 ----
branches_local = "../Data/branches.csv"
s3.Bucket(bucket_name).download_file(branches_s3_key, branches_local)
branches = pd.read_csv(branches_local)
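# branches.csv is assumed to already exist at DataLab/branches/ in the bucket;
# unlike the other downloads in this commit, this one has no try/except fallback.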
# ---- Helper functions ----
def realistic_credit_score():
    """Normal distribution around 680."""
    score = int(random.gauss(680, 60))
    return max(300, min(score, 850))

def realistic_income():
    brackets = [
        (20000, 40000),
        (40000, 70000),
        (70000, 120000),
        (120000, 200000)
    ]
    low, high = random.choice(brackets)
    return random.randint(low, high)

def realistic_employment():
    return random.choices(
        ["Employed", "Self-Employed", "Unemployed", "Student", "Retired"],
        weights=[50, 15, 10, 15, 10]
    )[0]

def realistic_contact():
    return random.choice(["Email", "Phone", "SMS"])
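# Illustrative: realistic_credit_score() draws from gauss(680, 60) and clamps to
# [300, 850], so most generated scores land roughly between 620 and 740.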
# ---- Generate Customers ----
customers = []
start_id = 100000 # Realistic banking customer IDs
for i in range(50):
    first = fake.first_name()
    last = fake.last_name()
    dob = fake.date_between(start_date="-80y", end_date="-18y")
    age = (datetime.now().date() - dob).days // 365
    income = realistic_income()
    credit = realistic_credit_score()
    customers.append({
        "customer_id": start_id + i,
        "first_name": first,
        "last_name": last,
        "full_name": f"{first} {last}",
        "email": f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}",
        "phone": fake.phone_number(),
        "date_of_birth": dob,
        "age": age,
        "gender": random.choice(["Male", "Female", "Other"]),
        "street_address": fake.street_address(),
        "city": fake.city(),
        "state": fake.state_abbr(),
        "zip_code": fake.zipcode(),
        "home_branch_id": random.choice(branches["branch_id"].tolist()),
        "customer_since": fake.date_between(start_date="-10y", end_date="today"),
        "employment_status": realistic_employment(),
        "annual_income": income,
        "credit_score": credit,
        "preferred_contact_method": realistic_contact(),
        "is_high_value_customer": income > 120000 or credit > 750,
        "age_group": (
            "18-25" if age < 26 else
            "26-35" if age < 36 else
            "36-50" if age < 51 else
            "51-65" if age < 66 else
            "66+"
        )
    })
df = pd.DataFrame(customers)
# ---- Save locally ----
local_file = "../Data/customers.csv"
df.to_csv(local_file, index=False)
print("Generated realistic customers.")
# ---- Upload to S3 ----
s3.Bucket(bucket_name).upload_file(local_file, customers_s3_key)
print(f"Uploaded customers.csv to s3://{bucket_name}/{customers_s3_key}")

View file

@@ -0,0 +1,115 @@
from faker import Faker
from dotenv import load_dotenv
from datetime import datetime
import os
import random
import pandas as pd
import boto3
# ---- Setup ----
fake = Faker()
load_dotenv()
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)
bucket_name = os.getenv("STORAGE_BUCKET")
accounts_key = "DataLab/accounts/accounts.csv"
transactions_s3_key = "DataLab/transactions/transactions.csv"
# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)
# ---- Download accounts.csv from S3 ----
local_accounts_file = "../Data/accounts.csv"
try:
    s3.Bucket(bucket_name).download_file(accounts_key, local_accounts_file)
    print("Downloaded accounts.csv from S3.")
except Exception as e:
    print("ERROR: Could not download accounts.csv:", e)
    raise SystemExit(1)
# ---- Load accounts DataFrame ----
accounts_df = pd.read_csv(local_accounts_file)
# ---- Sample vendors ----
vendors = ["Amazon", "Walmart", "Target", "Starbucks", "Apple", "Netflix", "Uber", "Lyft", "BestBuy", "Costco"]
# ---- Helper Functions ----
def generate_transaction_id(account_id, idx):
    """Generate a unique transaction ID combining account ID and index."""
    return f"{account_id}{str(idx).zfill(5)}"

def generate_transaction(account):
    """Generate a realistic transaction for a given account."""
    t_type = random.choices(
        ["Deposit", "Withdrawal", "Payment", "Transfer"],
        weights=[0.4, 0.3, 0.2, 0.1], k=1
    )[0]
    transaction_data = {
        "transaction_id": None,  # filled in by the caller
        "account_id": account['account_id'],
        "branch_id": None,
        "transaction_type": t_type,
        "amount": 0,
        "date": fake.date_between(start_date=pd.to_datetime(account['open_date']), end_date=datetime.today()),
        "balance_after": 0,
        "vendor": None,
        "transaction_location": None
    }
    if t_type in ["Deposit", "Withdrawal"]:
        # Deposits and withdrawals happen at the account's own branch
        transaction_data["branch_id"] = account['branch_id']
        amount = round(random.uniform(50, 7000), 2) if t_type == "Withdrawal" else round(random.uniform(20, 10000), 2)
        if t_type == "Withdrawal":
            amount = min(amount, account['balance'])
            account['balance'] -= amount
        else:
            account['balance'] += amount
        transaction_data["amount"] = amount
        transaction_data["balance_after"] = round(account['balance'], 2)
        transaction_data["transaction_location"] = f"Branch {account['branch_id']}"
    else:  # Payment or Transfer
        transaction_data["branch_id"] = None
        transaction_data["vendor"] = random.choice(vendors)
        amount = round(random.uniform(5, 1000), 2)
        account['balance'] = max(account['balance'] - amount, 0)
        transaction_data["amount"] = amount
        transaction_data["balance_after"] = round(account['balance'], 2)
        transaction_data["transaction_location"] = "POS / Online"
    return transaction_data
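# Illustrative example (values assumed): a "Payment" row might look like
# {"transaction_id": None, "account_id": "007483920166", "branch_id": None,
#  "transaction_type": "Payment", "amount": 42.50, "date": datetime.date(2023, 5, 1),
#  "balance_after": 1057.50, "vendor": "Netflix", "transaction_location": "POS / Online"},
# with transaction_id filled in afterwards by the caller.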
# ---- Generate transactions ----
transactions = []
idx = 1
for _, account in accounts_df.iterrows():
    account_transactions_count = random.randint(5, 20)
    for _ in range(account_transactions_count):
        txn = generate_transaction(account)
        txn['transaction_id'] = generate_transaction_id(account['account_id'], idx)
        transactions.append(txn)
        idx += 1
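# Because idx keeps increasing across all accounts, the generated transaction IDs
# stay unique even though the embedded account_id prefix repeats.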
# ---- Convert to DataFrame ----
transactions_df = pd.DataFrame(transactions)
# ---- Save locally ----
local_transactions_file = "../Data/transactions.csv"
transactions_df.to_csv(local_transactions_file, index=False)
print("Generated transactions.csv locally with realistic branch/vendor data.")
# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_transactions_file, transactions_s3_key)
    print(f"Uploaded transactions.csv to s3://{bucket_name}/{transactions_s3_key}")
except Exception as e:
    print("ERROR: Could not upload transactions.csv to S3:", e)