removed old version
This commit is contained in:
parent
7dc9ecc6db
commit
455d52aafa
1 changed file with 0 additions and 101 deletions
|
|
@@ -1,101 +0,0 @@
|
|||
from faker import Faker
|
||||
from dotenv import load_dotenv
|
||||
import os
|
||||
import pandas as pd
|
||||
import boto3
|
||||
import random
|
||||
from datetime import datetime
|
||||
|
||||
# Shared Faker instance used to generate every synthetic personal field below.
fake = Faker()

# ---- Load env ----
# Pull STORAGE_* settings from a local .env file (if present) into os.environ.
load_dotenv()

# ---- Hetzner S3 setup ----
# S3-compatible object storage client. Endpoint and credentials come from the
# environment; os.getenv() yields None for any variable that is not set.
s3 = boto3.resource(
    "s3",
    endpoint_url=os.getenv("STORAGE_ENDPOINT"),
    aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
    aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY"),
)

# Target bucket and the object keys this script reads from / writes to.
bucket_name = os.getenv("STORAGE_BUCKET")
customers_s3_key = "DataLab/customers/customers.csv"
branches_s3_key = "DataLab/branches/branches.csv"
|
||||
|
||||
# ---- Load branches from S3 ----
|
||||
# Local cache path for the branches file (relative to the working directory).
branches_local = "../Data/branches.csv"

# download_file() does not create missing parent directories, so make sure the
# destination folder exists before fetching the object.
os.makedirs(os.path.dirname(branches_local), exist_ok=True)
s3.Bucket(bucket_name).download_file(branches_s3_key, branches_local)

# Branch table; the customer generator below reads its "branch_id" column.
branches = pd.read_csv(branches_local)
|
||||
|
||||
# ---- Helper functions ----
|
||||
def realistic_credit_score(
    mean: float = 680,
    stddev: float = 60,
    minimum: int = 300,
    maximum: int = 850,
) -> int:
    """Return a random credit score drawn from a clamped normal distribution.

    With the defaults the score is centred on 680 (standard deviation 60)
    and limited to the 300-850 range.

    Args:
        mean: Centre of the normal distribution.
        stddev: Standard deviation of the normal distribution.
        minimum: Lowest score that can be returned.
        maximum: Highest score that can be returned.

    Returns:
        An integer score between ``minimum`` and ``maximum`` inclusive.
    """
    # round() rather than int(): truncation would shift the whole
    # distribution about half a point below the requested mean.
    score = round(random.gauss(mean, stddev))
    return max(minimum, min(score, maximum))
|
||||
|
||||
def realistic_income() -> int:
    """Return a random annual income between 20,000 and 200,000.

    One of four income brackets is picked with equal probability, then a
    whole-number income is drawn uniformly from that bracket (both bracket
    bounds are inclusive).
    """
    brackets = (
        (20000, 40000),
        (40000, 70000),
        (70000, 120000),
        (120000, 200000),
    )
    low, high = random.choice(brackets)
    return random.randint(low, high)
|
||||
|
||||
def realistic_employment() -> str:
    """Return a random employment status using fixed relative weights.

    Weights: Employed 50, Self-Employed 15, Unemployed 10, Student 15,
    Retired 10.
    """
    statuses = ["Employed", "Self-Employed", "Unemployed", "Student", "Retired"]
    weights = [50, 15, 10, 15, 10]
    # random.choices() returns a list of k items (k defaults to 1).
    return random.choices(statuses, weights=weights)[0]
|
||||
|
||||
def realistic_contact() -> str:
    """Return a preferred contact method picked uniformly at random.

    The result is one of "Email", "Phone" or "SMS".
    """
    return random.choice(["Email", "Phone", "SMS"])
|
||||
|
||||
# ---- Generate Customers ----
|
||||
# One dict per generated customer; converted to a DataFrame once complete.
customers = []
start_id = 100000  # Realistic banking customer IDs

# Loop invariants: take "today" once so every age uses the same reference
# date, and copy the branch ids into a plain list so random.choice() does not
# depend on pandas Series truthiness or index labels.
today = datetime.now().date()
branch_ids = branches["branch_id"].tolist()

for i in range(50):
    first = fake.first_name()
    last = fake.last_name()

    dob = fake.date_between(start_date="-80y", end_date="-18y")
    # Calendar-accurate age: subtract one year if this year's birthday has
    # not happened yet. (days // 365 drifts because of leap days.)
    birthday_pending = (today.month, today.day) < (dob.month, dob.day)
    age = today.year - dob.year - birthday_pending

    income = realistic_income()
    credit = realistic_credit_score()

    customers.append({
        "customer_id": start_id + i,
        "first_name": first,
        "last_name": last,
        "full_name": f"{first} {last}",
        "email": f"{first.lower()}.{last.lower()}@{fake.free_email_domain()}",
        "phone": fake.phone_number(),
        "date_of_birth": dob,
        "age": age,
        "gender": random.choice(["Male", "Female", "Other"]),
        "street_address": fake.street_address(),
        "city": fake.city(),
        "state": fake.state_abbr(),
        "zip_code": fake.zipcode(),
        "home_branch_id": random.choice(branch_ids),
        "customer_since": fake.date_between(start_date="-10y", end_date="today"),
        "employment_status": realistic_employment(),
        "annual_income": income,
        "credit_score": credit,
        "preferred_contact_method": realistic_contact(),
    })

df = pd.DataFrame(customers)
|
||||
|
||||
# ---- Save locally ----
|
||||
# Write the generated customers next to the downloaded branches file
# (path is relative to the working directory); the index is not written.
local_file = "../Data/customers.csv"
df.to_csv(local_file, index=False)
print("Generated realistic customers.")

# ---- Upload to S3 ----
# Push the CSV that was just written to the customers key in the bucket.
s3.Bucket(bucket_name).upload_file(local_file, customers_s3_key)
print(f"Uploaded customers.csv to s3://{bucket_name}/{customers_s3_key}")
|
||||
Loading…
Reference in a new issue