# Data_Lab/Scripts/customers.py

from sqlalchemy import create_engine, text
from urllib.parse import quote_plus
from faker import Faker
from dotenv import load_dotenv
import os
import io
import pandas as pd
import boto3
import random
from datetime import datetime

# ---- Load env ----
load_dotenv()
fake = Faker()

# ---- Postgres setup ----
# Fail fast with a readable message. Without this check an unset PG_PASSWORD
# makes quote_plus() raise an opaque TypeError, and an unset PG_USER/PG_HOST
# is silently formatted into the URL as the literal text "None".
_missing_pg = [
    name for name in ("PG_USER", "PG_PASSWORD", "PG_HOST")
    if os.getenv(name) is None
]
if _missing_pg:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_pg)}"
    )

# Percent-encode both credentials so characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters.
user = quote_plus(os.getenv("PG_USER"))
password = quote_plus(os.getenv("PG_PASSWORD"))
host = os.getenv("PG_HOST")
port = "5432"
db = "postgres"
engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")

# ---- Hetzner S3 setup ----
# Endpoint and credentials are read from the environment; the same resource
# is used both to fetch the branches file and to upload the customers backup.
_s3_settings = {
    "endpoint_url": os.getenv("STORAGE_ENDPOINT"),
    "aws_access_key_id": os.getenv("STORAGE_ACCESS_KEY"),
    "aws_secret_access_key": os.getenv("STORAGE_SECRET_KEY"),
}
s3 = boto3.resource("s3", **_s3_settings)

# Bucket name and the object keys this script reads from / writes to.
bucket_name = os.getenv("STORAGE_BUCKET")
branches_s3_key = "DataLab/branches/branches.csv"
customers_s3_key = "DataLab/customers/customers.parquet"

# ---- Load branches from S3 (still needed for customer assignment) ----
branches_local = "../Data/branches.csv"
# The path is relative to the current working directory; make sure its parent
# directory exists so the download does not fail on a fresh checkout.
os.makedirs(os.path.dirname(branches_local), exist_ok=True)
s3.Bucket(bucket_name).download_file(branches_s3_key, branches_local)
branches = pd.read_csv(branches_local)

# Customers are assigned a random branch_id below, so stop here with a clear
# message if the file cannot supply one.
if "branch_id" not in branches.columns or branches.empty:
    raise ValueError(
        f"{branches_s3_key} must contain a 'branch_id' column and at least one row"
    )

# ---- Load existing customers from Postgres for email uniqueness ----
with engine.connect() as conn:
    # to_regclass() resolves the unqualified name through the search_path,
    # exactly like the SELECT below. An information_schema lookup by name
    # alone would also match a "customers" table in an unrelated schema and
    # make the SELECT fail.
    table_exists = conn.execute(
        text("SELECT to_regclass('customers') IS NOT NULL;")
    ).scalar()

    if table_exists:
        existing_customers = pd.read_sql(
            text("SELECT email FROM customers;"),
            con=conn
        )
        # An empty result simply yields an empty set.
        existing_emails = set(existing_customers["email"])
    else:
        # First run: nothing to collide with yet.
        existing_emails = set()


# ---- Helper functions ----
def realistic_credit_score():
    """Return an integer credit score drawn from N(680, 60), clamped to 300-850."""
    score = int(random.gauss(680, 60))
    if score < 300:
        return 300
    if score > 850:
        return 850
    return score

def realistic_income():
    """Return a random annual income.

    One of four income brackets is picked with equal probability, then an
    integer is drawn uniformly from that bracket (both bounds inclusive).
    """
    income_ranges = (
        (20000, 40000),
        (40000, 70000),
        (70000, 120000),
        (120000, 200000),
    )
    bracket = random.choice(income_ranges)
    return random.randint(bracket[0], bracket[1])

def realistic_employment():
    """Return an employment status chosen with fixed relative weights."""
    # (status, relative weight) pairs; the weights sum to 100.
    weighted_statuses = (
        ("Employed", 50),
        ("Self-Employed", 15),
        ("Unemployed", 10),
        ("Student", 15),
        ("Retired", 10),
    )
    statuses, weights = zip(*weighted_statuses)
    (picked,) = random.choices(statuses, weights=weights, k=1)
    return picked

def realistic_contact():
    """Return a preferred contact method, chosen uniformly at random."""
    contact_methods = ("Email", "Phone", "SMS")
    return random.choice(contact_methods)

def generate_customer_id():
    """Return a random non-negative integer built from 48 random bits.

    Uniqueness is not checked here; the value is simply drawn at random.
    """
    id_bits = 48
    customer_id = random.getrandbits(id_bits)
    return customer_id

# ---- Generate Customers ----
def _new_identity():
    """Return a (first, last, email) triple built from fresh Faker names."""
    first = fake.first_name()
    last = fake.last_name()
    email = f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}"
    return first, last, email


def _age_on(born, today):
    """Return the number of completed years between two dates.

    Calendar-based: subtracts one year when the birthday has not yet occurred
    in today's year. Dividing the day difference by 365 ignores leap days and
    overstates the age shortly before a birthday.
    """
    birthday_pending = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_pending


# Plain Python list: random.choice() indexes by position, whereas indexing a
# pandas Series with an integer is label-based and only works by coincidence
# when the index happens to be 0..n-1.
branch_ids = branches["branch_id"].tolist()
today = datetime.now().date()

customers = []
for _ in range(50):
    # Redraw the whole identity until the email is unused, so the stored
    # name always matches the email it was derived from.
    first, last, email = _new_identity()
    while email in existing_emails:
        first, last, email = _new_identity()
    # Remember the email so later rows in this batch cannot reuse it.
    existing_emails.add(email)

    dob = fake.date_between(start_date="-80y", end_date="-18y")
    age = _age_on(dob, today)
    income = realistic_income()
    credit = realistic_credit_score()

    customers.append({
        "customer_id": generate_customer_id(),
        "full_name": f"{first} {last}",
        "email": email,
        "phone": fake.phone_number(),
        "date_of_birth": dob,
        "age": age,
        "gender": random.choice(["Male","Female","Other"]),
        "street_address": fake.street_address(),
        "city": fake.city(),
        "state": fake.state_abbr(),
        "zip_code": fake.zipcode(),
        "home_branch_id": random.choice(branch_ids),
        "customer_since": fake.date_between(start_date="-10y", end_date="today"),
        "employment_status": realistic_employment(),
        "annual_income": income,
        "credit_score": credit,
        "preferred_contact_method": realistic_contact()
    })

df = pd.DataFrame(customers)

# ---- Save to S3 backup ----
# Serialise the batch to Parquet in memory, then upload the raw bytes.
# NOTE(review): the object key is fixed and df holds only this run's rows, so
# each run appears to replace the previous backup object - confirm intended.
parquet_bytes = io.BytesIO()
df.to_parquet(parquet_bytes, index=False, engine="pyarrow")
backup_bucket = s3.Bucket(bucket_name)
backup_bucket.put_object(Key=customers_s3_key, Body=parquet_bytes.getvalue())
print("Uploaded customers.parquet to S3 (backup).")

# ---- Insert into Postgres ----
# Append to the existing table (created on first run) using multi-row INSERTs.
df.to_sql(
    name="customers",
    con=engine,
    if_exists="append",
    index=False,
    method="multi",
)
print("Inserted customers into Postgres successfully!")