Data_Lab/Scripts/Generate_accounts.py
2025-12-09 16:37:53 -07:00

114 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from faker import Faker
from dotenv import load_dotenv
from datetime import datetime
import os
import random
import pandas as pd
import boto3
# ---- Setup ----
fake = Faker()
load_dotenv()  # make the STORAGE_* settings read below available via os.getenv

# S3 resource handle; endpoint and credentials are taken from the environment.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY"),
)
bucket_name = os.getenv("STORAGE_BUCKET")
customers_key = "DataLab/customers/customers.csv"
accounts_s3_key = "DataLab/accounts/accounts.csv"

# ---- Ensure local data folder exists ----
# NOTE: the path is relative to the current working directory, not to this file.
os.makedirs("../Data", exist_ok=True)

# ---- Download customers.csv from S3 ----
local_customers_file = "../Data/customers.csv"
try:
    s3.Bucket(bucket_name).download_file(customers_key, local_customers_file)
except Exception as e:
    print("ERROR: Could not download customers.csv:", e)
    # Exit with a non-zero status so shells/schedulers see the run as failed;
    # a bare SystemExit() would report success (status 0).
    raise SystemExit(1) from e
else:
    print("Downloaded customers.csv from S3.")

# ---- Load customers DataFrame ----
customers_df = pd.read_csv(local_customers_file)

# Convert customer_since to actual date objects
customers_df["customer_since"] = pd.to_datetime(customers_df["customer_since"]).dt.date
# ---- Helper Functions ----
def generate_account_id(branch_id):
    """Generate a branch-coded account ID.

    The ID is the branch ID left-padded with zeros to at least 3 characters,
    followed by 9 random digits, so a branch ID of up to 3 digits yields a
    12-digit string.

    Args:
        branch_id: Branch identifier. Integral floats (e.g. ``12.0``, which
            pandas produces when the column contains missing values) are
            treated as the corresponding integer so no decimal point leaks
            into the ID.

    Returns:
        The account ID as a string.
    """
    # numpy.float64 subclasses float, so this also covers values read by pandas.
    if isinstance(branch_id, float) and branch_id.is_integer():
        branch_id = int(branch_id)
    branch_part = str(branch_id).zfill(3)  # at least 3 characters, zero-padded
    random_part = str(random.randint(10**8, 10**9 - 1))  # always 9 digits
    return branch_part + random_part
def generate_account_number():
    """Return a random 11-digit bank account number as a string."""
    lowest = 10**10        # smallest 11-digit integer
    highest = 10**11 - 1   # largest 11-digit integer
    number = random.randint(lowest, highest)
    return f"{number}"
def assign_account_types():
    """
    Pick the one or two account types a customer holds.

    A single uniform draw in [0, 1) selects the outcome:
    - below 0.50        -> Checking only   (~50%)
    - 0.50 up to 0.70   -> Savings only    (~20%)
    - 0.70 and above    -> both            (~30%)
    """
    draw = random.random()
    if draw >= 0.70:
        return ["Checking", "Savings"]
    if draw >= 0.50:
        return ["Savings"]
    return ["Checking"]
def balance_for_type(account_type):
    """Return a random balance, rounded to cents, for the given account type.

    "Checking" draws from 50 to 7000; every other value is treated as
    Savings and draws from 200 to 25000.
    """
    low, high = (50, 7000) if account_type == "Checking" else (200, 25000)
    amount = random.uniform(low, high)
    return round(amount, 2)
# ---- Generate accounts ----
# Column order of the output CSV; pinned so an empty customer list still
# produces a file with a header row.
account_columns = [
    "account_id",
    "account_number",
    "customer_id",
    "account_type",
    "open_date",
    "balance",
    "branch_id",
]

# Latest allowed open date, computed once so every account shares the same
# upper bound instead of re-reading the clock per account.
latest_open_date = datetime.today().date()

accounts = []
# Iterate the three needed columns in lockstep; this avoids building a
# Series per row the way iterrows() does.
customer_rows = zip(
    customers_df["customer_id"],
    customers_df["customer_since"],
    customers_df["home_branch_id"],
)
for customer_id, customer_since, home_branch_id in customer_rows:
    # Determine which account types this customer owns
    account_types = assign_account_types()
    for acct_type in account_types:
        accounts.append({
            "account_id": generate_account_id(home_branch_id),
            "account_number": generate_account_number(),
            "customer_id": customer_id,
            "account_type": acct_type,
            # An account cannot be opened before the customer relationship began.
            "open_date": fake.date_between(
                start_date=customer_since, end_date=latest_open_date
            ),
            "balance": balance_for_type(acct_type),
            "branch_id": home_branch_id,
        })

# ---- Convert to DataFrame ----
accounts_df = pd.DataFrame(accounts, columns=account_columns)

# ---- Save locally ----
local_accounts_file = "../Data/accounts.csv"
accounts_df.to_csv(local_accounts_file, index=False)
print("Generated accounts.csv locally.")

# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_accounts_file, accounts_s3_key)
except Exception as e:
    print("ERROR: Could not upload accounts.csv to S3:", e)
    # The local file is kept, but signal the failed upload through a
    # non-zero exit status so automated callers do not treat the run as OK.
    raise SystemExit(1) from e
else:
    print(f"Uploaded accounts.csv to s3://{bucket_name}/{accounts_s3_key}")