from faker import Faker
from dotenv import load_dotenv
from datetime import datetime
import os
import random
import pandas as pd
import boto3

# ---- Setup ----
fake = Faker()
load_dotenv()

# S3-compatible object storage; endpoint and credentials come from .env.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)

bucket_name = os.getenv("STORAGE_BUCKET")
customers_key = "DataLab/customers/customers.csv"
accounts_s3_key = "DataLab/accounts/accounts.csv"

# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)

# ---- Download customers.csv from S3 ----
local_customers_file = "../Data/customers.csv"
try:
    s3.Bucket(bucket_name).download_file(customers_key, local_customers_file)
    print("Downloaded customers.csv from S3.")
except Exception as e:
    print("ERROR: Could not download customers.csv:", e)
    # Exit non-zero so schedulers/CI register the failure
    # (bare SystemExit() exits with code 0, i.e. "success").
    raise SystemExit(1)

# ---- Load customers DataFrame ----
customers_df = pd.read_csv(local_customers_file)

# Convert customer_since to real date objects; fake.date_between needs dates,
# not strings.
customers_df["customer_since"] = pd.to_datetime(customers_df["customer_since"]).dt.date

# ---- Helper Functions ----

# IDs handed out so far; random 8-9 digit suffixes can (rarely) collide.
_used_account_ids = set()

def generate_account_id(branch_id):
    """Generate a unique branch-coded account ID (11-12 digits).

    The first 3 digits encode the branch; the remaining 8-9 are random.
    Re-rolls on the (unlikely) event of a duplicate so account IDs stay
    unique within a single run.
    """
    branch_part = str(branch_id).zfill(3)  # 3-digit branch prefix
    while True:
        random_part = str(random.randint(10**8, 10**9 - 1))  # 8-9 random digits
        account_id = branch_part + random_part
        if account_id not in _used_account_ids:
            _used_account_ids.add(account_id)
            return account_id

def generate_account_number():
    """Generate a realistic 11-digit bank account number."""
    return str(random.randint(10**10, (10**11) - 1))

def assign_account_types():
    """
    Assign 1-2 accounts per customer using realistic rules:
    - ~50% Checking Only
    - ~20% Savings Only
    - ~30% Both
    """
    roll = random.random()
    if roll < 0.50:
        return ["Checking"]
    if roll < 0.70:
        return ["Savings"]
    return ["Checking", "Savings"]

def balance_for_type(account_type):
    """Return a realistic opening balance for the given account type."""
    if account_type == "Checking":
        return round(random.uniform(50, 7000), 2)
    # Savings balances skew higher than checking.
    return round(random.uniform(200, 25000), 2)

# ---- Generate accounts ----
accounts = []

for _, row in customers_df.iterrows():
    customer_id = row["customer_id"]
    customer_since = row["customer_since"]
    home_branch_id = row["home_branch_id"]

    # Determine which account types this customer owns
    for acct_type in assign_account_types():
        accounts.append({
            "account_id": generate_account_id(home_branch_id),
            "account_number": generate_account_number(),
            "customer_id": customer_id,
            "account_type": acct_type,
            # An account can only be opened after the relationship began.
            "open_date": fake.date_between(
                start_date=customer_since, end_date=datetime.today().date()
            ),
            "balance": balance_for_type(acct_type),
            "branch_id": home_branch_id
        })

# ---- Convert to DataFrame ----
accounts_df = pd.DataFrame(accounts)

# ---- Save locally ----
local_accounts_file = "../Data/accounts.csv"
accounts_df.to_csv(local_accounts_file, index=False)
print("Generated accounts.csv locally.")

# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_accounts_file, accounts_s3_key)
    print(f"Uploaded accounts.csv to s3://{bucket_name}/{accounts_s3_key}")
except Exception as e:
    print("ERROR: Could not upload accounts.csv to S3:", e)
from faker import Faker
from dotenv import load_dotenv
import os
import pandas as pd
import boto3
import random
from datetime import datetime

fake = Faker()

# ---- Load env ----
load_dotenv()

# ---- Hetzner S3 setup ----
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)

bucket_name = os.getenv("STORAGE_BUCKET")
customers_s3_key = "DataLab/customers/customers.csv"
branches_s3_key = "DataLab/branches/branches.csv"

# ---- Ensure local data folder exists ----
# This script runs before the account/transaction generators, so it cannot
# rely on them having created ../Data already.
os.makedirs("../Data", exist_ok=True)

# ---- Load branches from S3 ----
branches_local = "../Data/branches.csv"
s3.Bucket(bucket_name).download_file(branches_s3_key, branches_local)
branches = pd.read_csv(branches_local)

# Plain Python list of branch IDs: random.choice on a pandas Series indexes
# by *label*, which silently misbehaves for non-RangeIndex frames.
branch_ids = branches["branch_id"].tolist()

# ---- Helper functions ----
def realistic_credit_score():
    """Credit score from a normal distribution around 680, clamped to 300-850."""
    score = int(random.gauss(680, 60))
    return max(300, min(score, 850))

def realistic_income():
    """Pick an income bracket, then a uniform annual income within it."""
    brackets = [
        (20000, 40000),
        (40000, 70000),
        (70000, 120000),
        (120000, 200000)
    ]
    low, high = random.choice(brackets)
    return random.randint(low, high)

def realistic_employment():
    """Employment status, weighted toward 'Employed'."""
    return random.choices(
        ["Employed", "Self-Employed", "Unemployed", "Student", "Retired"],
        weights=[50, 15, 10, 15, 10]
    )[0]

def realistic_contact():
    """Preferred contact channel, uniformly chosen."""
    return random.choice(["Email", "Phone", "SMS"])

# ---- Generate Customers ----
customers = []
start_id = 100000  # Realistic banking customer IDs
today = datetime.now().date()

for i in range(50):
    first = fake.first_name()
    last = fake.last_name()

    dob = fake.date_between(start_date="-80y", end_date="-18y")
    # Birthday-aware age: the old `days // 365` drifts by leap days and can
    # overstate age for customers whose birthday hasn't occurred yet this year.
    age = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))

    income = realistic_income()
    credit = realistic_credit_score()

    customers.append({
        "customer_id": start_id + i,
        "first_name": first,
        "last_name": last,
        "full_name": f"{first} {last}",
        "email": f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}",
        "phone": fake.phone_number(),
        "date_of_birth": dob,
        "age": age,
        "gender": random.choice(["Male", "Female", "Other"]),
        "street_address": fake.street_address(),
        "city": fake.city(),
        "state": fake.state_abbr(),
        "zip_code": fake.zipcode(),
        "home_branch_id": random.choice(branch_ids),
        "customer_since": fake.date_between(start_date="-10y", end_date="today"),
        "employment_status": realistic_employment(),
        "annual_income": income,
        "credit_score": credit,
        "preferred_contact_method": realistic_contact(),
        "is_high_value_customer": income > 120000 or credit > 750,
        "age_group": (
            "18-25" if age < 26 else
            "26-35" if age < 36 else
            "36-50" if age < 51 else
            "51-65" if age < 66 else
            "66+"
        )
    })

df = pd.DataFrame(customers)

# ---- Save locally ----
local_file = "../Data/customers.csv"
df.to_csv(local_file, index=False)
print("Generated realistic customers.")

# ---- Upload to S3 ----
s3.Bucket(bucket_name).upload_file(local_file, customers_s3_key)
print(f"Uploaded customers.csv to s3://{bucket_name}/{customers_s3_key}")
from faker import Faker
from dotenv import load_dotenv
from datetime import datetime
import os
import random
import pandas as pd
import boto3

# ---- Setup ----
fake = Faker()
load_dotenv()

# S3-compatible object storage; endpoint and credentials come from .env.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)

bucket_name = os.getenv("STORAGE_BUCKET")
accounts_key = "DataLab/accounts/accounts.csv"
transactions_s3_key = "DataLab/transactions/transactions.csv"

# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)

# ---- Download accounts.csv from S3 ----
local_accounts_file = "../Data/accounts.csv"
try:
    s3.Bucket(bucket_name).download_file(accounts_key, local_accounts_file)
    print("Downloaded accounts.csv from S3.")
except Exception as e:
    print("ERROR: Could not download accounts.csv:", e)
    # Exit non-zero so schedulers/CI register the failure
    # (bare SystemExit() exits with code 0, i.e. "success").
    raise SystemExit(1)

# ---- Load accounts DataFrame ----
accounts_df = pd.read_csv(local_accounts_file)

# ---- Sample vendors ----
vendors = ["Amazon", "Walmart", "Target", "Starbucks", "Apple", "Netflix", "Uber", "Lyft", "BestBuy", "Costco"]

# ---- Helper Functions ----
def generate_transaction_id(account_id, idx):
    """Unique transaction ID: account ID + zero-padded global counter."""
    return f"{account_id}{str(idx).zfill(5)}"

def generate_transaction(account, txn_date):
    """Generate one realistic transaction for `account` dated `txn_date`.

    Mutates account['balance'] so successive calls for the same account row
    build a running balance; callers must pass dates in chronological order
    for `balance_after` to line up with the timeline.
    """
    t_type = random.choices(
        ["Deposit", "Withdrawal", "Payment", "Transfer"],
        weights=[0.4, 0.3, 0.2, 0.1], k=1
    )[0]

    transaction_data = {
        "transaction_id": None,  # filled in by the caller
        "account_id": account['account_id'],
        "branch_id": None,
        "transaction_type": t_type,
        "amount": 0,
        "date": txn_date,
        "balance_after": 0,
        "vendor": None,
        "transaction_location": None
    }

    if t_type in ["Deposit", "Withdrawal"]:
        # In-branch activity happens at the account's home branch.
        transaction_data["branch_id"] = account['branch_id']
        if t_type == "Withdrawal":
            amount = round(random.uniform(50, 7000), 2)
            amount = min(amount, account['balance'])  # never overdraw
            account['balance'] -= amount
        else:
            amount = round(random.uniform(20, 10000), 2)
            account['balance'] += amount
        transaction_data["amount"] = amount
        transaction_data["balance_after"] = round(account['balance'], 2)
        transaction_data["transaction_location"] = f"Branch {account['branch_id']}"
    else:  # Payment or Transfer
        transaction_data["branch_id"] = None
        transaction_data["vendor"] = random.choice(vendors)
        amount = round(random.uniform(5, 1000), 2)
        account['balance'] = max(account['balance'] - amount, 0)  # floor at zero
        transaction_data["amount"] = amount
        transaction_data["balance_after"] = round(account['balance'], 2)
        transaction_data["transaction_location"] = "POS / Online"

    return transaction_data

# ---- Generate transactions ----
transactions = []
idx = 1
today = datetime.today()

for _, account in accounts_df.iterrows():
    open_date = pd.to_datetime(account['open_date'])
    # Draw all dates for the account first and sort them, so the running
    # balance (balance_after) is consistent with transaction chronology.
    # Previously each transaction drew its own random date while balances
    # were updated in generation order, leaving balances scrambled in time.
    txn_dates = sorted(
        fake.date_between(start_date=open_date, end_date=today)
        for _ in range(random.randint(5, 20))
    )
    for txn_date in txn_dates:
        txn = generate_transaction(account, txn_date)
        txn['transaction_id'] = generate_transaction_id(account['account_id'], idx)
        transactions.append(txn)
        idx += 1

# ---- Convert to DataFrame ----
transactions_df = pd.DataFrame(transactions)

# ---- Save locally ----
local_transactions_file = "../Data/transactions.csv"
transactions_df.to_csv(local_transactions_file, index=False)
print("Generated transactions.csv locally with realistic branch/vendor data.")

# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_transactions_file, transactions_s3_key)
    print(f"Uploaded transactions.csv to s3://{bucket_name}/{transactions_s3_key}")
except Exception as e:
    print("ERROR: Could not upload transactions.csv to S3:", e)