updated scripts to use postgres

parent 5b6eb89627
commit d76a55020f

4 changed files with 247 additions and 202 deletions
@@ -7,54 +7,49 @@ import pandas as pd
 import boto3
 import io
 from sqlalchemy import create_engine, text
-from urllib.parse import quote_plus
 
 # ---- Setup ----
 fake = Faker()
 load_dotenv()
 
-# ---- S3 Setup ----
-s3 = boto3.resource(
-    "s3",
-    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
-    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
-    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
-)
-
-bucket_name = os.getenv("STORAGE_BUCKET")
-customers_key_csv = "DataLab/customers/customers.csv"
-accounts_s3_key_parquet = "DataLab/accounts/accounts.parquet"
-
-# ---- Postgres Setup (optional) ----
+# ---- Postgres setup ----
 user = os.getenv("PG_USER")
 password = os.getenv("PG_PASSWORD")
 host = os.getenv("PG_HOST")
 port = "5432"
 db = "postgres"
 
-engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
+engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}", future=True)
 
-# ---- Ensure local data folder exists ----
-os.makedirs("../Data", exist_ok=True)
+# ---- S3 setup (backup only) ----
+s3 = boto3.resource(
+    "s3",
+    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
+    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
+    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
+)
+bucket_name = os.getenv("STORAGE_BUCKET")
+accounts_s3_key_parquet = "DataLab/accounts/accounts.parquet"
 
-# ---- Download customers.csv from S3 ----
-local_customers_file = "../Data/customers.csv"
-try:
-    s3.Bucket(bucket_name).download_file(customers_key_csv, local_customers_file)
-    print("Downloaded customers.csv from S3.")
-except Exception as e:
-    print("ERROR: Could not download customers.csv:", e)
-    raise SystemExit()
-
-# ---- Load customers DataFrame ----
-customers_df = pd.read_csv(local_customers_file)
+# ---- Load customers from Postgres ----
+with engine.connect() as conn:
+    customers_df = pd.read_sql(
+        sql=text("SELECT customer_id, home_branch_id, customer_since FROM customers;"),
+        con=conn
+    )
 customers_df["customer_since"] = pd.to_datetime(customers_df["customer_since"]).dt.date
 
-# ---- Helper Functions ----
+# ---- Unique account ID generator ----
+generated_ids = set()
 def generate_account_id(branch_id):
-    branch_part = str(branch_id).zfill(3)
-    random_part = str(random.randint(10**8, 10**9 - 1))
-    return branch_part + random_part
+    while True:
+        branch_part = str(branch_id).zfill(3)
+        random_part = str(random.randint(10**8, 10**9 - 1))
+        acct_id = branch_part + random_part
+        if acct_id not in generated_ids:
+            generated_ids.add(acct_id)
+            return acct_id
 
 def generate_account_number():
     return str(random.randint(10**10, 10**11 - 1))
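Not part of the commit: this hunk drops the quote_plus import while the customers script keeps it, so a PG_PASSWORD containing special characters would break the interpolated URL here. A sketch of sqlalchemy.engine.URL.create, which takes the password as a field instead of parsing it out of a string (same PG_* variables assumed):

    from sqlalchemy import create_engine
    from sqlalchemy.engine import URL

    # The password is passed as a structured field, so no manual quoting is needed.
    pg_url = URL.create(
        drivername="postgresql+psycopg2",
        username=os.getenv("PG_USER"),
        password=os.getenv("PG_PASSWORD"),
        host=os.getenv("PG_HOST"),
        port=5432,
        database="postgres",
    )
    engine = create_engine(pg_url, future=True)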
@@ -94,40 +89,26 @@ for _, row in customers_df.iterrows():
 
 accounts_df = pd.DataFrame(accounts)
 
-# ---- Save locally as CSV ----
-local_accounts_file = "../Data/accounts.csv"
-accounts_df.to_csv(local_accounts_file, index=False)
-print("Generated accounts.csv locally.")
+# ---- Save to S3 backup ----
+buffer = io.BytesIO()
+accounts_df.to_parquet(buffer, index=False, engine="pyarrow")
+s3.Bucket(bucket_name).put_object(Key=accounts_s3_key_parquet, Body=buffer.getvalue())
+print("Uploaded accounts.parquet to S3 (backup).")
 
-# ---- Upload / append to S3 as Parquet ----
-try:
-    obj = s3.Bucket(bucket_name).Object(accounts_s3_key_parquet).get()
-    existing_df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
-    combined_df = pd.concat([existing_df, accounts_df], ignore_index=True)
-    print(f"Appended {len(accounts_df)} rows to existing S3 Parquet")
-except s3.meta.client.exceptions.NoSuchKey:
-    combined_df = accounts_df
-    print("No existing Parquet on S3, creating new one")
-
-parquet_buffer = io.BytesIO()
-combined_df.to_parquet(parquet_buffer, index=False, engine="pyarrow")
-s3.Bucket(bucket_name).put_object(Key=accounts_s3_key_parquet, Body=parquet_buffer.getvalue())
-print(f"Uploaded accounts.parquet to s3")
-
-# ---- Append to Postgres ----
-try:
-    accounts_df.to_sql(
-        "accounts",
-        engine,
-        if_exists="append",
-        index=False,
-        method='multi'  # faster inserts
-    )
-except Exception as e:
-    print("Failed to insert into Postgres:")
-
-# ---- Optional: row count check ----
-with engine.connect() as conn:
-    existing_ids = pd.read_sql("SELECT account_id FROM accounts;", conn)
-    accounts_df = accounts_df[~accounts_df['account_id'].isin(existing_ids['account_id'])]
+# ---- Ensure accounts table exists and insert into Postgres ----
+with engine.begin() as conn:
+    conn.execute(text("""
+        CREATE TABLE IF NOT EXISTS accounts (
+            account_id VARCHAR(20) PRIMARY KEY,
+            account_number VARCHAR(20) UNIQUE,
+            customer_id BIGINT REFERENCES customers(customer_id),
+            account_type VARCHAR(50),
+            open_date DATE,
+            balance NUMERIC(12,2),
+            branch_id INT REFERENCES branches(branch_id)
+        );
+    """))
+
+    # Pandas to_sql now uses the connection from SQLAlchemy 2.x
+    accounts_df.to_sql("accounts", conn, if_exists="append", index=False, method="multi")
+    print(f"Inserted {len(accounts_df)} accounts into Postgres successfully!")
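Not part of the commit: with the old duplicate-id filter removed, re-running this script appends the same accounts again and trips the account_id primary key. A sketch of the insertion-method hook that pandas documents for to_sql, mirroring the ON CONFLICT DO NOTHING approach branches.py already uses:

    from sqlalchemy.dialects.postgresql import insert

    def insert_do_nothing(table, conn, keys, data_iter):
        # table.table is the underlying SQLAlchemy Table for "accounts"
        rows = [dict(zip(keys, row)) for row in data_iter]
        stmt = (
            insert(table.table)
            .values(rows)
            .on_conflict_do_nothing(index_elements=["account_id"])
        )
        return conn.execute(stmt).rowcount

    accounts_df.to_sql("accounts", conn, if_exists="append", index=False, method=insert_do_nothing)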

Scripts/branches.py (new file, 93 lines)

@@ -0,0 +1,93 @@
+from faker import Faker
+from dotenv import load_dotenv
+import os
+import pandas as pd
+import boto3
+import io
+from sqlalchemy import create_engine, text
+from urllib.parse import quote_plus
+
+# ---- Faker setup ----
+fake = Faker()
+load_dotenv()
+
+# ---- S3 Setup ----
+s3 = boto3.resource(
+    's3',
+    endpoint_url=os.getenv('STORAGE_ENDPOINT'),
+    aws_access_key_id=os.getenv('STORAGE_ACCESS_KEY'),
+    aws_secret_access_key=os.getenv('STORAGE_SECRET_KEY')
+)
+
+bucket_name = os.getenv('STORAGE_BUCKET')
+s3_key_csv = 'DataLab/branches/branches.csv'
+s3_key_parquet = 'DataLab/branches/branches.parquet'
+
+# ---- Postgres Setup ----
+user = os.getenv("PG_USER")
+password = os.getenv("PG_PASSWORD")
+host = os.getenv("PG_HOST")
+port = "5432"
+db = "postgres"
+
+engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
+
+# ---- Ensure local data folder exists ----
+os.makedirs("../Data", exist_ok=True)
+
+# ---- Generate branch data ----
+branches = []
+for i in range(1, 11):  # 10 branches
+    branches.append({
+        "branch_id": str(i),  # store as string for consistency
+        "branch_name": f"{fake.city()} Branch",
+        "address": fake.street_address(),
+        "city": fake.city(),
+        "state": fake.state_abbr()
+    })
+
+df = pd.DataFrame(branches)
+
+# ---- Save locally as CSV ----
+local_file = "../Data/branches.csv"
+df.to_csv(local_file, index=False)
+print("Generated 10 branches locally.")
+
+# ---- Upload CSV to S3 ----
+s3.Bucket(bucket_name).upload_file(local_file, s3_key_csv)
+print(f"Uploaded branches.csv to s3://{bucket_name}/{s3_key_csv}")
+
+# ---- Upload / append to S3 as Parquet ----
+try:
+    obj = s3.Bucket(bucket_name).Object(s3_key_parquet).get()
+    existing_df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
+    combined_df = pd.concat([existing_df, df], ignore_index=True)
+    print(f"Appended {len(df)} branches to existing Parquet on S3.")
+except s3.meta.client.exceptions.NoSuchKey:
+    combined_df = df
+    print("No existing branches Parquet on S3, creating new one.")
+
+parquet_buffer = io.BytesIO()
+combined_df.to_parquet(parquet_buffer, index=False, engine="pyarrow")
+s3.Bucket(bucket_name).put_object(Key=s3_key_parquet, Body=parquet_buffer.getvalue())
+print(f"Uploaded branches.parquet to s3://{bucket_name}/{s3_key_parquet}")
+
+# ---- Create / Append to Postgres ----
+with engine.connect() as conn:
+    for _, row in df.iterrows():
+        stmt = text("""
+            INSERT INTO branches (branch_id, branch_name, address, city, state)
+            VALUES (:branch_id, :branch_name, :address, :city, :state)
+            ON CONFLICT (branch_id) DO NOTHING
+        """)
+        conn.execute(stmt, {
+            "branch_id": str(row["branch_id"]),
+            "branch_name": row["branch_name"],
+            "address": row["address"],
+            "city": row["city"],
+            "state": row["state"]
+        })
+    conn.commit()
+    # Optional: row count check
+    result = conn.execute(text("SELECT COUNT(*) FROM branches;"))
+    print(f"Rows in branches table: {result.scalar()}")
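A possible follow-up, not in the commit: because the INSERT already carries ON CONFLICT (branch_id) DO NOTHING, the row-by-row loop can collapse into a single executemany call, and engine.begin() removes the explicit conn.commit() (stmt and df as defined above):

    with engine.begin() as conn:
        # A list of parameter dicts makes SQLAlchemy run one executemany round trip
        conn.execute(stmt, df.to_dict(orient="records"))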
@@ -8,51 +8,57 @@ import pandas as pd
 import boto3
 import random
 from datetime import datetime
-import pyarrow.parquet as pq
-
-user = os.getenv("PG_USER")
-password = os.getenv("PG_PASSWORD")
-host = os.getenv("PG_HOST")
-port = "5432"
-db = "postgres"
-
-engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
-
-fake = Faker()
 
 # ---- Load env ----
 load_dotenv()
+fake = Faker()
 
-# ---- Hetzner S3 setup ----
+# ---- Postgres setup ----
+user = os.getenv("PG_USER")
+password = quote_plus(os.getenv("PG_PASSWORD"))
+host = os.getenv("PG_HOST")
+port = "5432"
+db = "postgres"
+engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
+
+# ---- Hetzner S3 setup (backup only) ----
 s3 = boto3.resource(
     "s3",
     endpoint_url=os.getenv("STORAGE_ENDPOINT"),
     aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
     aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
 )
 
 bucket_name = os.getenv("STORAGE_BUCKET")
-customers_s3_key = "DataLab/customers/customers.csv"
 branches_s3_key = "DataLab/branches/branches.csv"
+customers_s3_key = "DataLab/customers/customers.parquet"
 
-# ---- Load branches from S3 ----
+# ---- Load branches from S3 (still needed for customer assignment) ----
 branches_local = "../Data/branches.csv"
 s3.Bucket(bucket_name).download_file(branches_s3_key, branches_local)
 branches = pd.read_csv(branches_local)
 
+# ---- Load existing customers from Postgres for email uniqueness ----
+with engine.connect() as conn:
+    table_exists = conn.execute(
+        text("SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name='customers');")
+    ).scalar()
+
+    if table_exists:
+        existing_customers = pd.read_sql(
+            text("SELECT email FROM customers;"),
+            con=conn
+        )
+        existing_emails = set(existing_customers["email"]) if not existing_customers.empty else set()
+    else:
+        existing_emails = set()
+
 # ---- Helper functions ----
 def realistic_credit_score():
-    """Normal distribution around 680."""
-    score = int(random.gauss(680, 60))
-    return max(300, min(score, 850))
+    return max(300, min(int(random.gauss(680, 60)), 850))
 
 def realistic_income():
-    brackets = [
-        (20000, 40000),
-        (40000, 70000),
-        (70000, 120000),
-        (120000, 200000)
-    ]
+    brackets = [(20000,40000),(40000,70000),(70000,120000),(120000,200000)]
     low, high = random.choice(brackets)
     return random.randint(low, high)
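An equivalent to the information_schema query, not in the commit: SQLAlchemy's inspector reports whether the table exists in the connection's default schema, which is what this check is really after:

    from sqlalchemy import inspect

    table_exists = inspect(engine).has_table("customers")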
@@ -65,26 +71,31 @@ def realistic_employment():
 def realistic_contact():
     return random.choice(["Email","Phone","SMS"])
 
+def generate_customer_id():
+    return random.getrandbits(48)
+
 # ---- Generate Customers ----
 customers = []
-start_id = 100000  # Realistic banking customer IDs
-
-for i in range(50):
+for _ in range(50):
     first = fake.first_name()
     last = fake.last_name()
+    email = f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}"
+
+    while email in existing_emails:
+        first = fake.first_name()
+        last = fake.last_name()
+        email = f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}"
+    existing_emails.add(email)
+
     dob = fake.date_between(start_date="-80y", end_date="-18y")
     age = (datetime.now().date() - dob).days // 365
 
     income = realistic_income()
     credit = realistic_credit_score()
 
     customers.append({
-        "customer_id": start_id + i,
-        "first_name": first,
-        "last_name": last,
+        "customer_id": generate_customer_id(),
         "full_name": f"{first} {last}",
-        "email": f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}",
+        "email": email,
         "phone": fake.phone_number(),
         "date_of_birth": dob,
         "age": age,
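Side note, not in the commit: if the email address does not have to match the generated name, Faker's unique proxy is a shorter guard, although it only prevents duplicates within this run, not against emails already stored in Postgres:

    email = fake.unique.email()  # raises UniquenessException once Faker runs out of candidates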
@@ -103,46 +114,12 @@ for i in range(50):
 
 df = pd.DataFrame(customers)
 
-# ---- Save locally ----
-local_file = f"../Data/customers_{datetime.now():%Y%m%d_%H%M%S}.csv"
-df.to_csv(local_file, index=False)
-print("Generated customers.")
-
-# ---- Upload / append to S3 as Parquet ----
-customers_s3_key = "DataLab/customers/customers.parquet"
-
-try:
-    # Check if Parquet exists
-    obj = s3.Bucket(bucket_name).Object(customers_s3_key).get()
-    existing_df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
-    df_s3 = pd.concat([existing_df, df], ignore_index=True)
-    print(f"Appended {len(df_s3)} rows to existing S3 Parquet")
-except s3.meta.client.exceptions.NoSuchKey:
-    # No existing file
-    df_s3 = df
-    print("No existing Parquet on S3, creating new one")
-
-# Convert to Parquet buffer
-parquet_buffer = io.BytesIO()
-df_s3.to_parquet(parquet_buffer, index=False, engine="pyarrow")
-
-# Upload to S3
-s3.Bucket(bucket_name).put_object(
-    Key=customers_s3_key,
-    Body=parquet_buffer.getvalue()
-)
-print(f"Uploaded customers.parquet to s3")
-
-# ---- Write customers to Postgres ----
-try:
-    df.to_sql("customers", engine, if_exists="append", index=False)
-    print("Inserted customers into Postgres successfully!")
-except Exception as e:
-    print("Failed to insert into Postgres:", e)
-
-with engine.connect() as conn:
-    result = conn.execute(text("SELECT COUNT(*) FROM customers;"))
-    print(f"Rows in customers table: {result.scalar()}")
+# ---- Save to S3 backup ----
+buffer = io.BytesIO()
+df.to_parquet(buffer, index=False, engine="pyarrow")
+s3.Bucket(bucket_name).put_object(Key=customers_s3_key, Body=buffer.getvalue())
+print("Uploaded customers.parquet to S3 (backup).")
+
+# ---- Insert into Postgres ----
+df.to_sql("customers", engine, if_exists="append", index=False, method="multi")
+print("Inserted customers into Postgres successfully!")
@@ -7,50 +7,42 @@ import pandas as pd
 import boto3
 import io
 from sqlalchemy import create_engine, text
-from urllib.parse import quote_plus
 
 # ---- Setup ----
 fake = Faker()
 load_dotenv()
 
-# ---- S3 Setup ----
+# ---- Postgres setup ----
+user = os.getenv("PG_USER")
+password = os.getenv("PG_PASSWORD")
+host = os.getenv("PG_HOST")
+port = "5432"
+db = "postgres"
+engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}", future=True)
+
+# ---- S3 setup (backup only) ----
 s3 = boto3.resource(
     "s3",
     endpoint_url=os.getenv("STORAGE_ENDPOINT"),
     aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
     aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
 )
 
 bucket_name = os.getenv("STORAGE_BUCKET")
-accounts_s3_key_parquet = "DataLab/accounts/accounts.parquet"
 transactions_s3_key_parquet = "DataLab/transactions/transactions.parquet"
 
-# ---- Postgres Setup ----
-PG_USER = os.getenv("PG_USER")
-PG_PASSWORD = quote_plus(os.getenv("PG_PASSWORD"))
-PG_HOST = os.getenv("PG_HOST")
-PG_PORT = "5432"
-PG_DB = "postgres"
-
-engine = create_engine(f"postgresql+psycopg2://{PG_USER}:{PG_PASSWORD}@{PG_HOST}:{PG_PORT}/{PG_DB}")
-
-# ---- Ensure local data folder exists ----
-os.makedirs("../Data", exist_ok=True)
-
-# ---- Download accounts parquet from S3 ----
-accounts_df = pd.DataFrame()
-try:
-    obj = s3.Bucket(bucket_name).Object(accounts_s3_key_parquet).get()
-    accounts_df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
-    print("Downloaded accounts.parquet from S3.")
-except s3.meta.client.exceptions.NoSuchKey:
-    print("ERROR: Accounts Parquet not found on S3.")
-    raise SystemExit()
+# ---- Load accounts from Postgres ----
+with engine.connect() as conn:
+    accounts_df = pd.read_sql(
+        sql=text("SELECT account_id, customer_id, branch_id, account_type, open_date, balance FROM accounts;"),
+        con=conn
+    )
+accounts_df["open_date"] = pd.to_datetime(accounts_df["open_date"]).dt.date
 
 # ---- Sample vendors ----
 vendors = ["Amazon", "Walmart", "Target", "Starbucks", "Apple", "Netflix", "Uber", "Lyft", "BestBuy", "Costco"]
 
-# ---- Helper Functions ----
+# ---- Helper functions ----
 def generate_transaction_id(account_id, idx):
     return f"{account_id}{str(idx).zfill(5)}"
@@ -63,28 +55,29 @@ def generate_transaction(account):
     txn = {
         "transaction_id": None,  # to be filled later
         "account_id": str(account['account_id']),
-        "branch_id": None,
+        "branch_id": account['branch_id'] if t_type in ["Deposit", "Withdrawal"] else None,
         "transaction_type": t_type,
         "amount": 0,
         "date": fake.date_between(start_date=pd.to_datetime(account['open_date']), end_date=datetime.today()),
         "balance_after": 0,
-        "vendor": None,
+        "vendor": random.choice(vendors) if t_type in ["Payment", "Transfer"] else None,
         "transaction_location": None
     }
 
-    if t_type in ["Deposit", "Withdrawal"]:
-        txn["branch_id"] = account.get('branch_id', None)
-        amount = round(random.uniform(50, 7000), 2) if t_type == "Withdrawal" else round(random.uniform(20, 10000), 2)
-        if t_type == "Withdrawal":
-            amount = min(amount, account['balance'])
-            account['balance'] -= amount
-        else:
+    if t_type == "Deposit":
+        amount = round(random.uniform(20, 10000), 2)
         account['balance'] += amount
         txn["amount"] = amount
         txn["balance_after"] = round(account['balance'], 2)
         txn["transaction_location"] = f"Branch {txn['branch_id']}"
+    elif t_type == "Withdrawal":
+        amount = round(random.uniform(50, 7000), 2)
+        amount = min(amount, account['balance'])
+        account['balance'] -= amount
+        txn["amount"] = amount
+        txn["balance_after"] = round(account['balance'], 2)
+        txn["transaction_location"] = f"Branch {txn['branch_id']}"
     else:  # Payment / Transfer
-        txn["vendor"] = random.choice(vendors)
         amount = round(random.uniform(5, 1000), 2)
         account['balance'] = max(account['balance'] - amount, 0)
         txn["amount"] = amount
@@ -96,7 +89,6 @@ def generate_transaction(account):
 # ---- Generate transactions ----
 transactions = []
 idx = 1
-
 for _, account in accounts_df.iterrows():
     account_transactions_count = random.randint(5, 20)
     for _ in range(account_transactions_count):
@@ -107,28 +99,30 @@ for _, account in accounts_df.iterrows():
 
 transactions_df = pd.DataFrame(transactions)
 
-# ---- Upload / append to S3 as Parquet ----
-try:
-    obj = s3.Bucket(bucket_name).Object(transactions_s3_key_parquet).get()
-    existing_df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
-    combined_df = pd.concat([existing_df, transactions_df], ignore_index=True)
-    print(f"Appended {len(transactions_df)} transactions to existing Parquet on S3.")
-except s3.meta.client.exceptions.NoSuchKey:
-    combined_df = transactions_df
-    print("No existing transactions Parquet on S3, creating new one.")
-
-# Convert to Parquet and upload
-parquet_buffer = io.BytesIO()
-combined_df.to_parquet(parquet_buffer, index=False, engine="pyarrow")
-s3.Bucket(bucket_name).put_object(Key=transactions_s3_key_parquet, Body=parquet_buffer.getvalue())
-print(f"Uploaded combined transactions.parquet to s3://{bucket_name}/{transactions_s3_key_parquet}")
-
-# ---- Append to Postgres ----
-try:
-    combined_df.to_sql("transactions", engine, if_exists="append", index=False)
-    print("Inserted transactions into Postgres successfully!")
-except Exception as e:
-    print("Failed to insert into Postgres:", e)
+# ---- Save to S3 backup ----
+buffer = io.BytesIO()
+transactions_df.to_parquet(buffer, index=False, engine="pyarrow")
+s3.Bucket(bucket_name).put_object(Key=transactions_s3_key_parquet, Body=buffer.getvalue())
+print("Uploaded transactions.parquet to S3 (backup).")
+
+# ---- Insert into Postgres ----
+with engine.begin() as conn:
+    conn.execute(text("""
+        CREATE TABLE IF NOT EXISTS transactions (
+            transaction_id VARCHAR(30) PRIMARY KEY,
+            account_id VARCHAR(20) REFERENCES accounts(account_id),
+            branch_id INT,
+            transaction_type VARCHAR(20),
+            amount NUMERIC(12,2),
+            date DATE,
+            balance_after NUMERIC(12,2),
+            vendor VARCHAR(50),
+            transaction_location VARCHAR(50)
+        );
+    """))
+
+    transactions_df.to_sql("transactions", conn, if_exists="append", index=False, method="multi")
+    print(f"Inserted {len(transactions_df)} transactions into Postgres successfully!")
 
 # ---- Optional: row count check ----
 with engine.connect() as conn:
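Not part of the commit: method="multi" folds many rows into each INSERT, so for larger generated batches a chunksize cap keeps each statement and its parameter list at a manageable size:

    transactions_df.to_sql(
        "transactions", conn,
        if_exists="append", index=False,
        method="multi", chunksize=1000,  # roughly 1000 rows per INSERT statement
    )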