updated script logic and generated sample data
This commit is contained in:
parent
5e911b4690
commit
125aa5e122
3 changed files with 338 additions and 0 deletions
114
Scripts/Generate_accounts.py
Normal file
114
Scripts/Generate_accounts.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
from faker import Faker
|
||||
from dotenv import load_dotenv
|
||||
from datetime import datetime
|
||||
import os
|
||||
import random
|
||||
import pandas as pd
|
||||
import boto3
|
||||
|
||||
# ---- Setup ----
fake = Faker()
load_dotenv()

# S3-compatible object storage client; all connection details come from .env.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)

bucket_name = os.getenv("STORAGE_BUCKET")
customers_key = "DataLab/customers/customers.csv"
accounts_s3_key = "DataLab/accounts/accounts.csv"

# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)

# ---- Download customers.csv from S3 ----
local_customers_file = "../Data/customers.csv"
try:
    s3.Bucket(bucket_name).download_file(customers_key, local_customers_file)
    print("Downloaded customers.csv from S3.")
except Exception as e:
    print("ERROR: Could not download customers.csv:", e)
    # Exit with a non-zero status so callers/schedulers see the failure.
    # (A bare SystemExit() exits with code 0, i.e. reports "success".)
    raise SystemExit(1)

# ---- Load customers DataFrame ----
customers_df = pd.read_csv(local_customers_file)

# Convert customer_since to actual date objects so Faker's date_between
# can use it as a start_date below.
customers_df["customer_since"] = pd.to_datetime(customers_df["customer_since"]).dt.date
|
||||
|
||||
# ---- Helper Functions ----
|
||||
|
||||
def generate_account_id(branch_id):
    """Generate a branch-coded, 12-digit numeric account ID.

    The ID is the branch ID zero-padded to 3 digits, followed by 9 random
    digits.  (randint(10**8, 10**9 - 1) always yields exactly 9 digits, so
    the result is always 12 characters — the previous docstring's claim of
    "11-12 digits" was inaccurate.)

    Args:
        branch_id: Branch identifier; anything str()-able, padded to 3 digits.

    Returns:
        str: 12-digit numeric account ID.
    """
    branch_part = str(branch_id).zfill(3)  # 3-digit branch prefix
    random_part = str(random.randint(10**8, 10**9 - 1))  # exactly 9 random digits
    return branch_part + random_part
|
||||
|
||||
def generate_account_number():
    """Return a realistic 11-digit bank account number as a string."""
    lower_bound = 10**10
    upper_bound = 10**11 - 1
    return str(random.randint(lower_bound, upper_bound))
|
||||
|
||||
def assign_account_types():
    """Pick which account types a customer owns.

    One uniform draw decides the mix:
      - ~50% Checking only
      - ~20% Savings only
      - ~30% both Checking and Savings
    """
    draw = random.random()
    if draw >= 0.70:
        return ["Checking", "Savings"]
    return ["Checking"] if draw < 0.50 else ["Savings"]
|
||||
|
||||
def balance_for_type(account_type):
    """Return a realistic opening balance for the given account type.

    Checking accounts carry $50-$7,000; anything else (Savings) carries
    $200-$25,000.  Rounded to cents.
    """
    low, high = (50, 7000) if account_type == "Checking" else (200, 25000)
    return round(random.uniform(low, high), 2)
|
||||
|
||||
# ---- Generate accounts ----
accounts = []

for _, customer in customers_df.iterrows():
    cust_id = customer["customer_id"]
    since = customer["customer_since"]
    branch = customer["home_branch_id"]

    # Each customer owns 1-2 accounts; assign_account_types() decides which.
    for account_type in assign_account_types():
        accounts.append({
            "account_id": generate_account_id(branch),
            "account_number": generate_account_number(),
            "customer_id": cust_id,
            "account_type": account_type,
            "open_date": fake.date_between(
                start_date=since, end_date=datetime.today().date()
            ),
            "balance": balance_for_type(account_type),
            "branch_id": branch
        })

# ---- Convert to DataFrame ----
accounts_df = pd.DataFrame(accounts)

# ---- Save locally ----
local_accounts_file = "../Data/accounts.csv"
accounts_df.to_csv(local_accounts_file, index=False)
print("Generated accounts.csv locally.")

# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_accounts_file, accounts_s3_key)
    print(f"Uploaded accounts.csv to s3://{bucket_name}/{accounts_s3_key}")
except Exception as e:
    print("ERROR: Could not upload accounts.csv to S3:", e)
|
||||
109
Scripts/Generate_customers.py
Normal file
109
Scripts/Generate_customers.py
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
from faker import Faker
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import pandas as pd
|
||||
import boto3
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
fake = Faker()

# ---- Load env ----
load_dotenv()

# ---- Hetzner S3 setup ----
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)

bucket_name = os.getenv("STORAGE_BUCKET")
customers_s3_key = "DataLab/customers/customers.csv"
branches_s3_key = "DataLab/branches/branches.csv"

# ---- Ensure local data folder exists ----
# The sibling generator scripts create ../Data before downloading; without
# this, download_file below fails on a fresh checkout.
os.makedirs("../Data", exist_ok=True)

# ---- Load branches from S3 ----
branches_local = "../Data/branches.csv"
try:
    s3.Bucket(bucket_name).download_file(branches_s3_key, branches_local)
except Exception as e:
    # Match the error-handling style of the other generator scripts and
    # exit non-zero so schedulers see the failure.
    print("ERROR: Could not download branches.csv:", e)
    raise SystemExit(1)
branches = pd.read_csv(branches_local)
|
||||
|
||||
# ---- Helper functions ----
|
||||
def realistic_credit_score():
    """Credit score drawn from a normal distribution centred on 680."""
    raw = int(random.gauss(680, 60))
    # Clamp into the valid 300-850 range.
    return min(850, max(300, raw))
|
||||
|
||||
def realistic_income():
    """Pick an income bracket uniformly, then a whole-dollar salary inside it."""
    brackets = (
        (20000, 40000),
        (40000, 70000),
        (70000, 120000),
        (120000, 200000),
    )
    low, high = random.choice(brackets)
    return random.randint(low, high)
|
||||
|
||||
def realistic_employment():
    """Employment status, weighted heavily toward 'Employed'."""
    statuses = ["Employed", "Self-Employed", "Unemployed", "Student", "Retired"]
    status_weights = [50, 15, 10, 15, 10]
    return random.choices(statuses, weights=status_weights)[0]
|
||||
|
||||
def realistic_contact():
    """Preferred contact channel, chosen uniformly at random."""
    channels = ("Email", "Phone", "SMS")
    return random.choice(channels)
|
||||
|
||||
# ---- Generate Customers ----
# Builds 50 synthetic customer records, each tied to a random branch from
# the branches DataFrame loaded above, then saves/uploads the CSV.
customers = []
start_id = 100000  # Realistic banking customer IDs

for i in range(50):
    first = fake.first_name()
    last = fake.last_name()

    # Adults only: birth dates between 80 and 18 years ago.
    dob = fake.date_between(start_date="-80y", end_date="-18y")
    # NOTE(review): integer division by 365 ignores leap days, so the age
    # can drift by ~1 year for the oldest customers — confirm this
    # approximation is acceptable.
    age = (datetime.now().date() - dob).days // 365

    income = realistic_income()
    credit = realistic_credit_score()

    customers.append({
        "customer_id": start_id + i,
        "first_name": first,
        "last_name": last,
        "full_name": f"{first} {last}",
        # NOTE(review): names containing apostrophes or spaces (e.g. O'Brien)
        # yield non-standard email addresses — verify downstream tolerance.
        "email": f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}",
        "phone": fake.phone_number(),
        "date_of_birth": dob,
        "age": age,
        "gender": random.choice(["Male", "Female", "Other"]),
        "street_address": fake.street_address(),
        "city": fake.city(),
        "state": fake.state_abbr(),
        "zip_code": fake.zipcode(),
        # Assign each customer to a random existing branch.
        "home_branch_id": random.choice(branches["branch_id"]),
        "customer_since": fake.date_between(start_date="-10y", end_date="today"),
        "employment_status": realistic_employment(),
        "annual_income": income,
        "credit_score": credit,
        "preferred_contact_method": realistic_contact(),
        # High value = top income bracket or excellent credit.
        "is_high_value_customer": income > 120000 or credit > 750,
        # Pre-bucketed age band for downstream segmentation.
        "age_group": (
            "18-25" if age < 26 else
            "26-35" if age < 36 else
            "36-50" if age < 51 else
            "51-65" if age < 66 else
            "66+"
        )
    })

df = pd.DataFrame(customers)

# ---- Save locally ----
local_file = "../Data/customers.csv"
df.to_csv(local_file, index=False)
print("Generated realistic customers.")

# ---- Upload to S3 ----
s3.Bucket(bucket_name).upload_file(local_file, customers_s3_key)
print(f"Uploaded customers.csv to s3://{bucket_name}/{customers_s3_key}")
|
||||
115
Scripts/Generate_transactions.py
Normal file
115
Scripts/Generate_transactions.py
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
from faker import Faker
|
||||
from dotenv import load_dotenv
|
||||
from datetime import datetime
|
||||
import os
|
||||
import random
|
||||
import pandas as pd
|
||||
import boto3
|
||||
|
||||
# ---- Setup ----
fake = Faker()
load_dotenv()

# S3-compatible object storage client; all connection details come from .env.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
)

bucket_name = os.getenv("STORAGE_BUCKET")
accounts_key = "DataLab/accounts/accounts.csv"
transactions_s3_key = "DataLab/transactions/transactions.csv"

# ---- Ensure local data folder exists ----
os.makedirs("../Data", exist_ok=True)

# ---- Download accounts.csv from S3 ----
local_accounts_file = "../Data/accounts.csv"
try:
    s3.Bucket(bucket_name).download_file(accounts_key, local_accounts_file)
    print("Downloaded accounts.csv from S3.")
except Exception as e:
    print("ERROR: Could not download accounts.csv:", e)
    # Exit with a non-zero status so callers/schedulers see the failure.
    # (A bare SystemExit() exits with code 0, i.e. reports "success".)
    raise SystemExit(1)

# ---- Load accounts DataFrame ----
accounts_df = pd.read_csv(local_accounts_file)

# ---- Sample vendors ----
# Merchant names used for Payment/Transfer transactions below.
vendors = ["Amazon", "Walmart", "Target", "Starbucks", "Apple", "Netflix", "Uber", "Lyft", "BestBuy", "Costco"]
|
||||
|
||||
# ---- Helper Functions ----
|
||||
def generate_transaction_id(account_id, idx):
    """Build a unique transaction ID: account ID followed by a zero-padded index."""
    suffix = str(idx).zfill(5)
    return "{}{}".format(account_id, suffix)
|
||||
|
||||
def generate_transaction(account):
    """Generate a realistic transaction for a given account.

    Mutates account['balance'] in place, so consecutive calls for the same
    account object produce a running balance.  `account` must support
    [] access for 'account_id', 'branch_id', 'balance' and 'open_date'
    (e.g. a dict or a pandas Series row).

    Returns a dict with 'transaction_id' left as None for the caller to fill.
    """
    # Weighted draw: 40% deposit, 30% withdrawal, 20% payment, 10% transfer.
    t_type = random.choices(
        ["Deposit", "Withdrawal", "Payment", "Transfer"],
        weights=[0.4, 0.3, 0.2, 0.1], k=1
    )[0]

    transaction_data = {
        "transaction_id": None,  # fill later
        "account_id": account['account_id'],
        "branch_id": None,
        "transaction_type": t_type,
        "amount": 0,
        # Random date between the account's opening date and today.
        # NOTE: dates are drawn independently per transaction, so the
        # running balance below is in generation order, not date order.
        "date": fake.date_between(start_date=pd.to_datetime(account['open_date']), end_date=datetime.today()),
        "balance_after": 0,
        "vendor": None,
        "transaction_location": None
    }

    if t_type in ["Deposit", "Withdrawal"]:
        # Pick one of the branches for deposit/withdrawal
        transaction_data["branch_id"] = account['branch_id']
        amount = round(random.uniform(50, 7000), 2) if t_type == "Withdrawal" else round(random.uniform(20, 10000), 2)
        if t_type == "Withdrawal":
            # Cap the withdrawal at the current balance (no overdraft).
            amount = min(amount, account['balance'])
            account['balance'] -= amount
        else:
            account['balance'] += amount
        transaction_data["amount"] = amount
        transaction_data["balance_after"] = round(account['balance'], 2)
        transaction_data["transaction_location"] = f"Branch {account['branch_id']}"

    else:  # Payment or Transfer
        transaction_data["branch_id"] = None
        # NOTE(review): a vendor is assigned to "Transfer" rows as well as
        # "Payment" rows — confirm that is intended.
        transaction_data["vendor"] = random.choice(vendors)
        amount = round(random.uniform(5, 1000), 2)
        # Clamp at zero rather than allowing a negative balance; the recorded
        # 'amount' may therefore exceed the actual balance change.
        account['balance'] = max(account['balance'] - amount, 0)
        transaction_data["amount"] = amount
        transaction_data["balance_after"] = round(account['balance'], 2)
        transaction_data["transaction_location"] = "POS / Online"

    return transaction_data
|
||||
|
||||
# ---- Generate transactions ----
transactions = []
idx = 1  # global running counter so every transaction ID gets a unique suffix

for _, account in accounts_df.iterrows():
    # 5-20 transactions per account.
    account_transactions_count = random.randint(5, 20)
    # NOTE: iterrows() yields a copy of each row, so the balance mutation
    # done inside generate_transaction() accumulates across this account's
    # transactions but is never written back to accounts_df.
    for _ in range(account_transactions_count):
        txn = generate_transaction(account)
        txn['transaction_id'] = generate_transaction_id(account['account_id'], idx)
        transactions.append(txn)
        idx += 1

# ---- Convert to DataFrame ----
transactions_df = pd.DataFrame(transactions)

# ---- Save locally ----
local_transactions_file = "../Data/transactions.csv"
transactions_df.to_csv(local_transactions_file, index=False)
print("Generated transactions.csv locally with realistic branch/vendor data.")

# ---- Upload to S3 ----
try:
    s3.Bucket(bucket_name).upload_file(local_transactions_file, transactions_s3_key)
    print(f"Uploaded transactions.csv to s3://{bucket_name}/{transactions_s3_key}")
except Exception as e:
    print("ERROR: Could not upload transactions.csv to S3:", e)
|
||||
Loading…
Reference in a new issue