deleted old version
This commit is contained in:
parent
6043afa20c
commit
572e06f436
1 changed file with 0 additions and 114 deletions
|
|
@@ -1,114 +0,0 @@
|
||||||
from faker import Faker
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from datetime import datetime
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
import pandas as pd
|
|
||||||
import boto3
|
|
||||||
|
|
||||||
# ---- Setup ----
fake = Faker()
# Load the STORAGE_* settings (if a .env file is present) into the environment.
load_dotenv()

# S3 resource handle; endpoint and credentials are read from the environment.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY"),
)

bucket_name = os.getenv("STORAGE_BUCKET")
customers_key = "DataLab/customers/customers.csv"
accounts_s3_key = "DataLab/accounts/accounts.csv"

# ---- Ensure local data folder exists ----
# NOTE: "../Data" is resolved against the current working directory.
os.makedirs("../Data", exist_ok=True)

# ---- Download customers.csv from S3 ----
local_customers_file = "../Data/customers.csv"
try:
    s3.Bucket(bucket_name).download_file(customers_key, local_customers_file)
except Exception as e:
    print("ERROR: Could not download customers.csv:", e)
    # Nothing can be generated without the customer list. Exit with a
    # non-zero status so shells/schedulers see the run as failed
    # (a bare SystemExit() would report success).
    raise SystemExit(1)
else:
    print("Downloaded customers.csv from S3.")

# ---- Load customers DataFrame ----
customers_df = pd.read_csv(local_customers_file)

# Convert customer_since to actual date objects
customers_df["customer_since"] = pd.to_datetime(customers_df["customer_since"]).dt.date
|
|
||||||
|
|
||||||
# ---- Helper Functions ----
|
|
||||||
|
|
||||||
def generate_account_id(branch_id):
    """Build a branch-coded account ID: padded branch code + 9 random digits.

    Args:
        branch_id: Branch identifier. Integral floats (e.g. ``7.0``, as
            produced when a numeric column is loaded as float) are treated
            as their integer value so the ID stays purely numeric.

    Returns:
        str: ``branch_id`` left-padded with zeros to at least 3 characters,
        followed by exactly 9 random digits. For branch codes of up to three
        digits the result is always 12 characters long.
    """
    # Without this, str(7.0) would put "7.0" into the ID.
    if isinstance(branch_id, float) and branch_id.is_integer():
        branch_id = int(branch_id)

    branch_part = str(branch_id).zfill(3)
    # randint is inclusive on both ends: always 9 digits, never a leading 0.
    random_part = str(random.randint(10**8, 10**9 - 1))
    return branch_part + random_part
|
|
||||||
|
|
||||||
def generate_account_number():
    """Return a random bank account number as a string of exactly 11 digits."""
    lowest = 10**10          # smallest 11-digit integer
    highest = 10**11 - 1     # largest 11-digit integer
    return str(random.randint(lowest, highest))
|
|
||||||
|
|
||||||
def assign_account_types():
    """Pick the account type(s) a customer holds from a single random draw.

    Distribution of the result:
    - roughly 50%: ``["Checking"]``
    - roughly 20%: ``["Savings"]``
    - roughly 30%: ``["Checking", "Savings"]``

    Returns:
        list[str]: one or two account type names.
    """
    draw = random.random()

    # Top 30% of the unit interval: both account types.
    if draw >= 0.70:
        return ["Checking", "Savings"]
    # Next 20% (0.50 <= draw < 0.70): savings only.
    if draw >= 0.50:
        return ["Savings"]
    # Remaining lower half: checking only.
    return ["Checking"]
|
|
||||||
|
|
||||||
def balance_for_type(account_type):
    """Return a random balance, rounded to 2 decimals, for the account type.

    "Checking" draws uniformly from 50 to 7000; every other value of
    ``account_type`` (i.e. Savings) draws uniformly from 200 to 25000.
    """
    if account_type == "Checking":
        low, high = 50, 7000
    else:  # Savings
        low, high = 200, 25000
    return round(random.uniform(low, high), 2)
|
|
||||||
|
|
||||||
# ---- Generate accounts ----
accounts = []

# Upper bound for open dates; identical for every account, so compute it once.
today = datetime.today().date()

# Identifiers handed out so far, so a random collision is re-drawn instead of
# producing two accounts that share an ID or account number.
used_account_ids = set()
used_account_numbers = set()


def _draw_unique(draw, seen):
    """Call draw() until it returns a value not in seen; record and return it."""
    value = draw()
    while value in seen:
        value = draw()
    seen.add(value)
    return value


# Walk only the three columns that are needed, in lockstep. This avoids
# building a Series per row as iterrows() does.
customer_columns = zip(
    customers_df["customer_id"],
    customers_df["customer_since"],
    customers_df["home_branch_id"],
)

for customer_id, customer_since, home_branch_id in customer_columns:
    # Determine which account types this customer owns
    for acct_type in assign_account_types():
        accounts.append({
            "account_id": _draw_unique(
                lambda: generate_account_id(home_branch_id), used_account_ids
            ),
            "account_number": _draw_unique(
                generate_account_number, used_account_numbers
            ),
            "customer_id": customer_id,
            "account_type": acct_type,
            # An account is opened between the customer's start date and today.
            "open_date": fake.date_between(
                start_date=customer_since, end_date=today
            ),
            "balance": balance_for_type(acct_type),
            "branch_id": home_branch_id,
        })

# ---- Convert to DataFrame ----
# Columns are pinned so the CSV still gets a header row when there are no
# customers (an empty list would otherwise produce a column-less frame).
accounts_df = pd.DataFrame(
    accounts,
    columns=[
        "account_id",
        "account_number",
        "customer_id",
        "account_type",
        "open_date",
        "balance",
        "branch_id",
    ],
)

# ---- Save locally ----
local_accounts_file = "../Data/accounts.csv"
accounts_df.to_csv(local_accounts_file, index=False)
print("Generated accounts.csv locally.")

# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_accounts_file, accounts_s3_key)
except Exception as e:
    print("ERROR: Could not upload accounts.csv to S3:", e)
    # The local file is kept, but report the failed upload through the exit
    # status, matching how a failed download is handled.
    raise SystemExit(1)
else:
    print(f"Uploaded accounts.csv to s3://{bucket_name}/{accounts_s3_key}")
|
|
||||||
Loading…
Reference in a new issue