updated script logic and generated sample data

This commit is contained in:
CameronCSS 2025-12-09 16:37:53 -07:00
parent 5e911b4690
commit 125aa5e122
3 changed files with 338 additions and 0 deletions

View file

@@ -0,0 +1,114 @@
from faker import Faker
from dotenv import load_dotenv
from datetime import datetime
import os
import random
import pandas as pd
import boto3
# ---- Setup ----
fake = Faker()
load_dotenv()
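# NOTE: load_dotenv() reads a local .env file; this script expects it to define
# STORAGE_ENDPOINT, STORAGE_ACCESS_KEY, STORAGE_SECRET_KEY, and STORAGE_BUCKET.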
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)
bucket_name = os.getenv("STORAGE_BUCKET")
customers_key = "DataLab/customers/customers.csv"
accounts_s3_key = "DataLab/accounts/accounts.csv"
# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)
# ---- Download customers.csv from S3 ----
local_customers_file = "../Data/customers.csv"
try:
    s3.Bucket(bucket_name).download_file(customers_key, local_customers_file)
    print("Downloaded customers.csv from S3.")
except Exception as e:
    print("ERROR: Could not download customers.csv:", e)
    raise SystemExit(1)
# ---- Load customers DataFrame ----
customers_df = pd.read_csv(local_customers_file)
# Convert customer_since to actual date objects
customers_df["customer_since"] = pd.to_datetime(customers_df["customer_since"]).dt.date
# ---- Helper Functions ----
def generate_account_id(branch_id):
    """Generate realistic branch-coded 12-digit account IDs."""
    branch_part = str(branch_id).zfill(3)  # 3-digit branch ID
    random_part = str(random.randint(10**8, 10**9 - 1))  # 9 random digits
    return branch_part + random_part

def generate_account_number():
    """Generate realistic 11-digit bank account numbers."""
    return str(random.randint(10**10, (10**11) - 1))

def assign_account_types():
    """
    Assign 1-2 accounts per customer using realistic rules:
    - ~50% Checking Only
    - ~20% Savings Only
    - ~30% Both
    """
    roll = random.random()
    if roll < 0.50:
        return ["Checking"]
    elif roll < 0.70:
        return ["Savings"]
    else:
        return ["Checking", "Savings"]

def balance_for_type(account_type):
    """Give realistic account balances."""
    if account_type == "Checking":
        return round(random.uniform(50, 7000), 2)
    else:  # Savings
        return round(random.uniform(200, 25000), 2)
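# Illustrative example (values assumed): generate_account_id(7) could return
# "007" + "483920166" -> "007483920166", i.e. a 3-digit branch code followed
# by 9 random digits.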
# ---- Generate accounts ----
accounts = []
for _, row in customers_df.iterrows():
    customer_id = row["customer_id"]
    customer_since = row["customer_since"]
    home_branch_id = row["home_branch_id"]
    # Determine which account types this customer owns
    account_types = assign_account_types()
    for acct_type in account_types:
        accounts.append({
            "account_id": generate_account_id(home_branch_id),
            "account_number": generate_account_number(),
            "customer_id": customer_id,
            "account_type": acct_type,
            "open_date": fake.date_between(
                start_date=customer_since, end_date=datetime.today().date()
            ),
            "balance": balance_for_type(acct_type),
            "branch_id": home_branch_id
        })
# ---- Convert to DataFrame ----
accounts_df = pd.DataFrame(accounts)
# ---- Save locally ----
local_accounts_file = "../Data/accounts.csv"
accounts_df.to_csv(local_accounts_file, index=False)
print("Generated accounts.csv locally.")
# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_accounts_file, accounts_s3_key)
    print(f"Uploaded accounts.csv to s3://{bucket_name}/{accounts_s3_key}")
except Exception as e:
    print("ERROR: Could not upload accounts.csv to S3:", e)

View file

@@ -0,0 +1,109 @@
from faker import Faker
from dotenv import load_dotenv
import os
import pandas as pd
import boto3
import random
from datetime import datetime
fake = Faker()
# ---- Load env ----
load_dotenv()
# ---- Hetzner S3 setup ----
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)
bucket_name = os.getenv("STORAGE_BUCKET")
customers_s3_key = "DataLab/customers/customers.csv"
branches_s3_key = "DataLab/branches/branches.csv"
# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)
# ---- Load branches from S3 ----
branches_local = "../Data/branches.csv"
s3.Bucket(bucket_name).download_file(branches_s3_key, branches_local)
branches = pd.read_csv(branches_local)
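# branches.csv is assumed to already exist at DataLab/branches/ in the bucket;
# unlike the other downloads in this commit, this one has no try/except fallback.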
# ---- Helper functions ----
def realistic_credit_score():
    """Normal distribution around 680."""
    score = int(random.gauss(680, 60))
    return max(300, min(score, 850))

def realistic_income():
    brackets = [
        (20000, 40000),
        (40000, 70000),
        (70000, 120000),
        (120000, 200000)
    ]
    low, high = random.choice(brackets)
    return random.randint(low, high)

def realistic_employment():
    return random.choices(
        ["Employed", "Self-Employed", "Unemployed", "Student", "Retired"],
        weights=[50, 15, 10, 15, 10]
    )[0]

def realistic_contact():
    return random.choice(["Email", "Phone", "SMS"])
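# Illustrative: realistic_credit_score() draws from gauss(680, 60) and clamps to
# [300, 850], so most generated scores land roughly between 620 and 740.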
# ---- Generate Customers ----
customers = []
start_id = 100000 # Realistic banking customer IDs
for i in range(50):
    first = fake.first_name()
    last = fake.last_name()
    dob = fake.date_between(start_date="-80y", end_date="-18y")
    age = (datetime.now().date() - dob).days // 365
    income = realistic_income()
    credit = realistic_credit_score()
    customers.append({
        "customer_id": start_id + i,
        "first_name": first,
        "last_name": last,
        "full_name": f"{first} {last}",
        "email": f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}",
        "phone": fake.phone_number(),
        "date_of_birth": dob,
        "age": age,
        "gender": random.choice(["Male", "Female", "Other"]),
        "street_address": fake.street_address(),
        "city": fake.city(),
        "state": fake.state_abbr(),
        "zip_code": fake.zipcode(),
        "home_branch_id": random.choice(branches["branch_id"].tolist()),
        "customer_since": fake.date_between(start_date="-10y", end_date="today"),
        "employment_status": realistic_employment(),
        "annual_income": income,
        "credit_score": credit,
        "preferred_contact_method": realistic_contact(),
        "is_high_value_customer": income > 120000 or credit > 750,
        "age_group": (
            "18-25" if age < 26 else
            "26-35" if age < 36 else
            "36-50" if age < 51 else
            "51-65" if age < 66 else
            "66+"
        )
    })
df = pd.DataFrame(customers)
# ---- Save locally ----
local_file = "../Data/customers.csv"
df.to_csv(local_file, index=False)
print("Generated realistic customers.")
# ---- Upload to S3 ----
s3.Bucket(bucket_name).upload_file(local_file, customers_s3_key)
print(f"Uploaded customers.csv to s3://{bucket_name}/{customers_s3_key}")

View file

@@ -0,0 +1,115 @@
from faker import Faker
from dotenv import load_dotenv
from datetime import datetime
import os
import random
import pandas as pd
import boto3
# ---- Setup ----
fake = Faker()
load_dotenv()
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)
bucket_name = os.getenv("STORAGE_BUCKET")
accounts_key = "DataLab/accounts/accounts.csv"
transactions_s3_key = "DataLab/transactions/transactions.csv"
# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)
# ---- Download accounts.csv from S3 ----
local_accounts_file = "../Data/accounts.csv"
try:
    s3.Bucket(bucket_name).download_file(accounts_key, local_accounts_file)
    print("Downloaded accounts.csv from S3.")
except Exception as e:
    print("ERROR: Could not download accounts.csv:", e)
    raise SystemExit(1)
# ---- Load accounts DataFrame ----
accounts_df = pd.read_csv(local_accounts_file)
# ---- Sample vendors ----
vendors = ["Amazon", "Walmart", "Target", "Starbucks", "Apple", "Netflix", "Uber", "Lyft", "BestBuy", "Costco"]
# ---- Helper Functions ----
def generate_transaction_id(account_id, idx):
    """Generate a unique transaction ID combining account ID and index."""
    return f"{account_id}{str(idx).zfill(5)}"

def generate_transaction(account):
    """Generate a realistic transaction for a given account."""
    t_type = random.choices(
        ["Deposit", "Withdrawal", "Payment", "Transfer"],
        weights=[0.4, 0.3, 0.2, 0.1], k=1
    )[0]
    transaction_data = {
        "transaction_id": None,  # filled in by the caller
        "account_id": account['account_id'],
        "branch_id": None,
        "transaction_type": t_type,
        "amount": 0,
        "date": fake.date_between(start_date=pd.to_datetime(account['open_date']), end_date=datetime.today()),
        "balance_after": 0,
        "vendor": None,
        "transaction_location": None
    }
    if t_type in ["Deposit", "Withdrawal"]:
        # Deposits and withdrawals happen at the account's own branch
        transaction_data["branch_id"] = account['branch_id']
        amount = round(random.uniform(50, 7000), 2) if t_type == "Withdrawal" else round(random.uniform(20, 10000), 2)
        if t_type == "Withdrawal":
            amount = min(amount, account['balance'])
            account['balance'] -= amount
        else:
            account['balance'] += amount
        transaction_data["amount"] = amount
        transaction_data["balance_after"] = round(account['balance'], 2)
        transaction_data["transaction_location"] = f"Branch {account['branch_id']}"
    else:  # Payment or Transfer
        transaction_data["branch_id"] = None
        transaction_data["vendor"] = random.choice(vendors)
        amount = round(random.uniform(5, 1000), 2)
        account['balance'] = max(account['balance'] - amount, 0)
        transaction_data["amount"] = amount
        transaction_data["balance_after"] = round(account['balance'], 2)
        transaction_data["transaction_location"] = "POS / Online"
    return transaction_data
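# Illustrative example (values assumed): a "Payment" row might look like
# {"transaction_id": None, "account_id": "007483920166", "branch_id": None,
#  "transaction_type": "Payment", "amount": 42.50, "date": datetime.date(2023, 5, 1),
#  "balance_after": 1057.50, "vendor": "Netflix", "transaction_location": "POS / Online"},
# with transaction_id filled in afterwards by the caller.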
# ---- Generate transactions ----
transactions = []
idx = 1
for _, account in accounts_df.iterrows():
    account_transactions_count = random.randint(5, 20)
    for _ in range(account_transactions_count):
        txn = generate_transaction(account)
        txn['transaction_id'] = generate_transaction_id(account['account_id'], idx)
        transactions.append(txn)
        idx += 1
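# Because idx keeps increasing across all accounts, the generated transaction IDs
# stay unique even though the embedded account_id prefix repeats.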
# ---- Convert to DataFrame ----
transactions_df = pd.DataFrame(transactions)
# ---- Save locally ----
local_transactions_file = "../Data/transactions.csv"
transactions_df.to_csv(local_transactions_file, index=False)
print("Generated transactions.csv locally with realistic branch/vendor data.")
# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_transactions_file, transactions_s3_key)
    print(f"Uploaded transactions.csv to s3://{bucket_name}/{transactions_s3_key}")
except Exception as e:
    print("ERROR: Could not upload transactions.csv to S3:", e)