"""Generate synthetic bank accounts for the customers stored in S3.

Pipeline: download ``customers.csv`` from the configured bucket, create one
or two accounts per customer, write ``accounts.csv`` locally and upload it
back to the bucket. The process exits with a non-zero status when the
download or the upload fails.
"""

from datetime import datetime
import os
import random

import boto3
from dotenv import load_dotenv
from faker import Faker
import pandas as pd

# ---- Setup ----
fake = Faker()
load_dotenv()

s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY"),
)

bucket_name = os.getenv("STORAGE_BUCKET")
customers_key = "DataLab/customers/customers.csv"
accounts_s3_key = "DataLab/accounts/accounts.csv"

# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)

# ---- Download customers.csv from S3 ----
local_customers_file = "../Data/customers.csv"

try:
    s3.Bucket(bucket_name).download_file(customers_key, local_customers_file)
    print("Downloaded customers.csv from S3.")
except Exception as e:
    print("ERROR: Could not download customers.csv:", e)
    # A bare SystemExit() exits with status 0; report the failure instead.
    raise SystemExit(1)

# ---- Load customers DataFrame ----
customers_df = pd.read_csv(local_customers_file)

# Convert customer_since to actual date objects
customers_df["customer_since"] = pd.to_datetime(customers_df["customer_since"]).dt.date


# ---- Helper Functions ----
def generate_account_id(branch_id):
    """Generate a branch-coded account ID.

    The ID is ``branch_id`` left-padded with zeros to at least three
    characters, followed by nine random digits (12 characters in total for
    branch IDs up to 999).
    """
    branch_part = str(branch_id).zfill(3)  # zero-padded branch prefix
    random_part = str(random.randint(10**8, 10**9 - 1))  # always 9 digits
    return branch_part + random_part


def generate_account_number():
    """Generate a random 11-digit bank account number as a string."""
    return str(random.randint(10**10, (10**11) - 1))


def assign_account_types():
    """
    Assign 1–2 accounts per customer using these proportions:
    - ~50% Checking Only
    - ~20% Savings Only
    - ~30% Both

    Returns a list of account type names.
    """
    roll = random.random()
    if roll < 0.50:
        return ["Checking"]
    elif roll < 0.70:
        return ["Savings"]
    else:
        return ["Checking", "Savings"]


def balance_for_type(account_type):
    """Return a random balance rounded to cents for the given account type.

    "Checking" draws from 50–7000; any other type is treated as Savings and
    draws from 200–25000.
    """
    if account_type == "Checking":
        return round(random.uniform(50, 7000), 2)
    else:  # Savings
        return round(random.uniform(200, 25000), 2)


def _draw_unique(seen, generate, *args):
    """Call ``generate(*args)`` until it yields a value not in ``seen``.

    The accepted value is added to ``seen`` before being returned, so the
    same set can be reused across calls to keep generated keys distinct.
    """
    while True:
        candidate = generate(*args)
        if candidate not in seen:
            seen.add(candidate)
            return candidate


# ---- Generate accounts ----
account_columns = [
    "account_id",
    "account_number",
    "customer_id",
    "account_type",
    "open_date",
    "balance",
    "branch_id",
]

accounts = []
used_account_ids = set()
used_account_numbers = set()
today = datetime.today().date()  # computed once; invariant across customers

customer_rows = zip(
    customers_df["customer_id"],
    customers_df["customer_since"],
    customers_df["home_branch_id"],
)

for customer_id, customer_since, home_branch_id in customer_rows:
    # A customer_since later than today would give date_between a start
    # date after its end date, so clamp the start to today.
    earliest_open = customer_since if customer_since <= today else today

    # Determine which account types this customer owns
    for acct_type in assign_account_types():
        accounts.append({
            # Random IDs can collide; redraw until unused in this run.
            "account_id": _draw_unique(
                used_account_ids, generate_account_id, home_branch_id
            ),
            "account_number": _draw_unique(
                used_account_numbers, generate_account_number
            ),
            "customer_id": customer_id,
            "account_type": acct_type,
            "open_date": fake.date_between(
                start_date=earliest_open,
                end_date=today,
            ),
            "balance": balance_for_type(acct_type),
            "branch_id": home_branch_id,
        })

# ---- Convert to DataFrame ----
# Explicit columns keep the CSV header even when there are no customers.
accounts_df = pd.DataFrame(accounts, columns=account_columns)

# ---- Save locally ----
local_accounts_file = "../Data/accounts.csv"
accounts_df.to_csv(local_accounts_file, index=False)

print("Generated accounts.csv locally.")

# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_accounts_file, accounts_s3_key)
    print(f"Uploaded accounts.csv to s3://{bucket_name}/{accounts_s3_key}")
except Exception as e:
    print("ERROR: Could not upload accounts.csv to S3:", e)
    # Exit non-zero so callers can detect the failed upload, matching the
    # download failure path above.
    raise SystemExit(1)