# Data_Lab/Scripts/branches.py
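"""Generate 10 fake branch records with Faker, save them locally as CSV,
upload the CSV to S3, append them to a Parquet object on S3, and insert them
into a Postgres branches table (duplicate branch_ids are skipped via
ON CONFLICT DO NOTHING).

Expects these variables in a .env file (loaded via python-dotenv):
STORAGE_ENDPOINT, STORAGE_ACCESS_KEY, STORAGE_SECRET_KEY, STORAGE_BUCKET,
PG_USER, PG_PASSWORD, PG_HOST.
"""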
import io
import os
from urllib.parse import quote_plus

import boto3
import pandas as pd
from dotenv import load_dotenv
from faker import Faker
from sqlalchemy import create_engine, text
# ---- Faker + env setup ----
fake = Faker()
load_dotenv()  # load credentials from a local .env file
# ---- S3 Setup ----
s3 = boto3.resource(
    's3',
    endpoint_url=os.getenv('STORAGE_ENDPOINT'),
    aws_access_key_id=os.getenv('STORAGE_ACCESS_KEY'),
    aws_secret_access_key=os.getenv('STORAGE_SECRET_KEY')
)
bucket_name = os.getenv('STORAGE_BUCKET')
s3_key_csv = 'DataLab/branches/branches.csv'
s3_key_parquet = 'DataLab/branches/branches.parquet'
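# Optional sanity check: head_bucket fails fast if the endpoint, credentials,
# or bucket are misconfigured (assumes the credentials may call HeadBucket).
# s3.meta.client.head_bucket(Bucket=bucket_name)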
# ---- Postgres Setup ----
user = os.getenv("PG_USER")
password = os.getenv("PG_PASSWORD")
host = os.getenv("PG_HOST")
port = "5432"
db = "postgres"
# quote_plus() URL-encodes the password so characters like '@' or ':' don't
# break the connection URL.
engine = create_engine(f"postgresql+psycopg2://{user}:{quote_plus(password)}@{host}:{port}/{db}")
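# The insert at the bottom of this script uses ON CONFLICT (branch_id), which
# requires the branches table to exist with a unique or primary-key constraint
# on branch_id. A minimal compatible schema sketch, run idempotently here (the
# TEXT column types are an assumption, not taken from the original script):
with engine.begin() as conn:
    conn.execute(text("""
        CREATE TABLE IF NOT EXISTS branches (
            branch_id   TEXT PRIMARY KEY,
            branch_name TEXT,
            address     TEXT,
            city        TEXT,
            state       TEXT
        )
    """))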
# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)
# ---- Generate branch data ----
branches = []
for i in range(1, 11):  # 10 branches
    branches.append({
        "branch_id": str(i),  # store as string for consistency
        "branch_name": f"{fake.city()} Branch",
        "address": fake.street_address(),
        "city": fake.city(),
        "state": fake.state_abbr()
    })
df = pd.DataFrame(branches)
# ---- Save locally as CSV ----
local_file = "../Data/branches.csv"
df.to_csv(local_file, index=False)
print("Generated 10 branches locally.")
# ---- Upload CSV to S3 ----
s3.Bucket(bucket_name).upload_file(local_file, s3_key_csv)
print(f"Uploaded branches.csv to s3://{bucket_name}/{s3_key_csv}")
# ---- Upload / append to S3 as Parquet ----
try:
    obj = s3.Bucket(bucket_name).Object(s3_key_parquet).get()
    existing_df = pd.read_parquet(io.BytesIO(obj['Body'].read()))
    combined_df = pd.concat([existing_df, df], ignore_index=True)
    print(f"Appended {len(df)} branches to existing Parquet on S3.")
except s3.meta.client.exceptions.NoSuchKey:
    combined_df = df
    print("No existing branches Parquet on S3, creating new one.")
parquet_buffer = io.BytesIO()
combined_df.to_parquet(parquet_buffer, index=False, engine="pyarrow")
s3.Bucket(bucket_name).put_object(Key=s3_key_parquet, Body=parquet_buffer.getvalue())
print(f"Uploaded branches.parquet to s3://{bucket_name}/{s3_key_parquet}")
# ---- Create / Append to Postgres ----
# Build the statement once instead of recompiling it for every row.
insert_stmt = text("""
    INSERT INTO branches (branch_id, branch_name, address, city, state)
    VALUES (:branch_id, :branch_name, :address, :city, :state)
    ON CONFLICT (branch_id) DO NOTHING
""")
with engine.connect() as conn:
    for _, row in df.iterrows():
        conn.execute(insert_stmt, {
            "branch_id": str(row["branch_id"]),
            "branch_name": row["branch_name"],
            "address": row["address"],
            "city": row["city"],
            "state": row["state"]
        })
    conn.commit()
    # Optional: row count check
    result = conn.execute(text("SELECT COUNT(*) FROM branches;"))
    print(f"Rows in branches table: {result.scalar()}")