"""Generate synthetic bank accounts for the customers stored in S3.

Pipeline (run as a script):
  1. Download ``customers.csv`` from the configured S3-compatible bucket.
  2. Create one or two accounts (Checking / Savings) for every customer.
  3. Write ``accounts.csv`` locally and upload it back to the bucket.

Connection settings are read from the environment (optionally via ``.env``):
``STORAGE_ENDPOINT``, ``STORAGE_ACCESS_KEY``, ``STORAGE_SECRET_KEY`` and
``STORAGE_BUCKET``.
"""
from datetime import datetime
import os
import random

import pandas as pd

# ---- Object keys and local paths ----
customers_key = "DataLab/customers/customers.csv"
accounts_s3_key = "DataLab/accounts/accounts.csv"
local_customers_file = "../Data/customers.csv"
local_accounts_file = "../Data/accounts.csv"

# Column order of accounts.csv; fixed so an empty customer list still
# produces a CSV with a header row.
_ACCOUNT_COLUMNS = [
    "account_id",
    "account_number",
    "customer_id",
    "account_type",
    "open_date",
    "balance",
    "branch_id",
]


# ---- Helper Functions ----

def generate_account_id(branch_id, used=None):
    """Generate a branch-coded account ID.

    The ID is the branch ID zero-padded to at least 3 digits followed by
    9 random digits, i.e. 12 digits for branch IDs up to 999.

    Args:
        branch_id: Branch identifier; converted with ``str()``.
        used: Optional set of IDs already handed out. When given, the
            returned ID is guaranteed not to be in the set and is added
            to it. When ``None``, no uniqueness check is performed.

    Returns:
        The account ID as a string.
    """
    branch_part = str(branch_id).zfill(3)
    while True:
        candidate = branch_part + str(random.randint(10**8, 10**9 - 1))
        if used is None:
            return candidate
        if candidate not in used:
            used.add(candidate)
            return candidate


def generate_account_number(used=None):
    """Generate an 11-digit bank account number.

    Args:
        used: Optional set of numbers already handed out. When given, the
            returned number is guaranteed not to be in the set and is
            added to it. When ``None``, no uniqueness check is performed.

    Returns:
        The account number as a string of 11 digits.
    """
    while True:
        candidate = str(random.randint(10**10, (10**11) - 1))
        if used is None:
            return candidate
        if candidate not in used:
            used.add(candidate)
            return candidate


def assign_account_types():
    """Pick the account types a customer owns.

    Distribution:
      - ~50% Checking only
      - ~20% Savings only
      - ~30% both

    Returns:
        A list with one or two of ``"Checking"`` / ``"Savings"``.
    """
    roll = random.random()

    if roll < 0.50:
        return ["Checking"]
    if roll < 0.70:
        return ["Savings"]
    return ["Checking", "Savings"]


def balance_for_type(account_type):
    """Return a random balance, rounded to 2 decimals, for an account type.

    ``"Checking"`` draws from 50–7000; any other value is treated as
    Savings and draws from 200–25000.
    """
    if account_type == "Checking":
        return round(random.uniform(50, 7000), 2)
    return round(random.uniform(200, 25000), 2)


def _build_accounts(customers_df, fake):
    """Build the accounts DataFrame for every row of ``customers_df``.

    Args:
        customers_df: DataFrame with ``customer_id``, ``customer_since``
            (date objects) and ``home_branch_id`` columns.
        fake: Object exposing ``date_between(start_date=, end_date=)``
            (a ``Faker`` instance in the script).

    Returns:
        DataFrame with the columns listed in ``_ACCOUNT_COLUMNS``.
    """
    today = datetime.today().date()  # loop-invariant, computed once
    used_ids = set()
    used_numbers = set()
    accounts = []

    # zip over the three needed columns instead of iterrows(): no per-row
    # Series construction and no dtype coercion of the values.
    for customer_id, customer_since, home_branch_id in zip(
        customers_df["customer_id"],
        customers_df["customer_since"],
        customers_df["home_branch_id"],
    ):
        for acct_type in assign_account_types():
            accounts.append({
                "account_id": generate_account_id(home_branch_id, used_ids),
                "account_number": generate_account_number(used_numbers),
                "customer_id": customer_id,
                "account_type": acct_type,
                "open_date": fake.date_between(
                    start_date=customer_since, end_date=today
                ),
                "balance": balance_for_type(acct_type),
                "branch_id": home_branch_id,
            })

    return pd.DataFrame(accounts, columns=_ACCOUNT_COLUMNS)


def main():
    """Run the download → generate → save → upload pipeline.

    Returns:
        Process exit status: 0 on success, 1 if the download from or the
        upload to S3 failed (the error is printed in both cases).
    """
    # Imported here rather than at module level so the pure helpers above
    # can be imported without the S3 / Faker / dotenv packages installed.
    import boto3
    from dotenv import load_dotenv
    from faker import Faker

    # ---- Setup ----
    fake = Faker()
    load_dotenv()

    s3 = boto3.resource(
        "s3",
        endpoint_url=os.getenv("STORAGE_ENDPOINT"),
        aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
        aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY"),
    )
    bucket_name = os.getenv("STORAGE_BUCKET")

    # ---- Ensure local data folder exists ----
    os.makedirs("../Data", exist_ok=True)

    # ---- Download customers.csv from S3 ----
    try:
        s3.Bucket(bucket_name).download_file(customers_key, local_customers_file)
    except Exception as e:
        print("ERROR: Could not download customers.csv:", e)
        # Non-zero status so callers/schedulers can detect the failure.
        return 1
    print("Downloaded customers.csv from S3.")

    # ---- Load customers DataFrame ----
    customers_df = pd.read_csv(local_customers_file)
    # Convert customer_since to actual date objects
    customers_df["customer_since"] = pd.to_datetime(
        customers_df["customer_since"]
    ).dt.date

    # ---- Generate accounts and save locally ----
    accounts_df = _build_accounts(customers_df, fake)
    accounts_df.to_csv(local_accounts_file, index=False)
    print("Generated accounts.csv locally.")

    # ---- Upload to S3 ----
    try:
        s3.Bucket(bucket_name).upload_file(local_accounts_file, accounts_s3_key)
    except Exception as e:
        print("ERROR: Could not upload accounts.csv to S3:", e)
        return 1
    print(f"Uploaded accounts.csv to s3://{bucket_name}/{accounts_s3_key}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())