removed old version
This commit is contained in:
parent
455d52aafa
commit
34a1828338
1 changed files with 0 additions and 115 deletions
|
|
@ -1,115 +0,0 @@
|
||||||
from faker import Faker
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from datetime import datetime
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
import pandas as pd
|
|
||||||
import boto3
|
|
||||||
|
|
||||||
# ---- Setup ----
|
|
||||||
fake = Faker()
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
s3 = boto3.resource(
|
|
||||||
"s3",
|
|
||||||
endpoint_url=os.getenv("STORAGE_ENDPOINT"),
|
|
||||||
aws_access_key_id=os.getenv("STORAGE_ACCESS_KEY"),
|
|
||||||
aws_secret_access_key=os.getenv("STORAGE_SECRET_KEY")
|
|
||||||
)
|
|
||||||
|
|
||||||
bucket_name = os.getenv("STORAGE_BUCKET")
|
|
||||||
accounts_key = "DataLab/accounts/accounts.csv"
|
|
||||||
transactions_s3_key = "DataLab/transactions/transactions.csv"
|
|
||||||
|
|
||||||
# ---- Ensure local data folder exists ----
|
|
||||||
os.makedirs("../Data", exist_ok=True)
|
|
||||||
|
|
||||||
# ---- Download accounts.csv from S3 ----
|
|
||||||
local_accounts_file = "../Data/accounts.csv"
|
|
||||||
try:
|
|
||||||
s3.Bucket(bucket_name).download_file(accounts_key, local_accounts_file)
|
|
||||||
print("Downloaded accounts.csv from S3.")
|
|
||||||
except Exception as e:
|
|
||||||
print("ERROR: Could not download accounts.csv:", e)
|
|
||||||
raise SystemExit()
|
|
||||||
|
|
||||||
# ---- Load accounts DataFrame ----
|
|
||||||
accounts_df = pd.read_csv(local_accounts_file)
|
|
||||||
|
|
||||||
# ---- Sample vendors ----
|
|
||||||
vendors = ["Amazon", "Walmart", "Target", "Starbucks", "Apple", "Netflix", "Uber", "Lyft", "BestBuy", "Costco"]
|
|
||||||
|
|
||||||
# ---- Helper Functions ----
|
|
||||||
def generate_transaction_id(account_id, idx):
|
|
||||||
"""Generate a unique transaction ID combining account ID and index."""
|
|
||||||
return f"{account_id}{str(idx).zfill(5)}"
|
|
||||||
|
|
||||||
def generate_transaction(account):
|
|
||||||
"""Generate a realistic transaction for a given account."""
|
|
||||||
t_type = random.choices(
|
|
||||||
["Deposit", "Withdrawal", "Payment", "Transfer"],
|
|
||||||
weights=[0.4, 0.3, 0.2, 0.1], k=1
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
transaction_data = {
|
|
||||||
"transaction_id": None, # fill later
|
|
||||||
"account_id": account['account_id'],
|
|
||||||
"branch_id": None,
|
|
||||||
"transaction_type": t_type,
|
|
||||||
"amount": 0,
|
|
||||||
"date": fake.date_between(start_date=pd.to_datetime(account['open_date']), end_date=datetime.today()),
|
|
||||||
"balance_after": 0,
|
|
||||||
"vendor": None,
|
|
||||||
"transaction_location": None
|
|
||||||
}
|
|
||||||
|
|
||||||
if t_type in ["Deposit", "Withdrawal"]:
|
|
||||||
# Pick one of the branches for deposit/withdrawal
|
|
||||||
transaction_data["branch_id"] = account['branch_id']
|
|
||||||
amount = round(random.uniform(50, 7000), 2) if t_type == "Withdrawal" else round(random.uniform(20, 10000), 2)
|
|
||||||
if t_type == "Withdrawal":
|
|
||||||
amount = min(amount, account['balance'])
|
|
||||||
account['balance'] -= amount
|
|
||||||
else:
|
|
||||||
account['balance'] += amount
|
|
||||||
transaction_data["amount"] = amount
|
|
||||||
transaction_data["balance_after"] = round(account['balance'], 2)
|
|
||||||
transaction_data["transaction_location"] = f"Branch {account['branch_id']}"
|
|
||||||
|
|
||||||
else: # Payment or Transfer
|
|
||||||
transaction_data["branch_id"] = None
|
|
||||||
transaction_data["vendor"] = random.choice(vendors)
|
|
||||||
amount = round(random.uniform(5, 1000), 2)
|
|
||||||
account['balance'] = max(account['balance'] - amount, 0)
|
|
||||||
transaction_data["amount"] = amount
|
|
||||||
transaction_data["balance_after"] = round(account['balance'], 2)
|
|
||||||
transaction_data["transaction_location"] = "POS / Online"
|
|
||||||
|
|
||||||
return transaction_data
|
|
||||||
|
|
||||||
# ---- Generate transactions ----
|
|
||||||
transactions = []
|
|
||||||
idx = 1
|
|
||||||
|
|
||||||
for _, account in accounts_df.iterrows():
|
|
||||||
account_transactions_count = random.randint(5, 20)
|
|
||||||
for _ in range(account_transactions_count):
|
|
||||||
txn = generate_transaction(account)
|
|
||||||
txn['transaction_id'] = generate_transaction_id(account['account_id'], idx)
|
|
||||||
transactions.append(txn)
|
|
||||||
idx += 1
|
|
||||||
|
|
||||||
# ---- Convert to DataFrame ----
|
|
||||||
transactions_df = pd.DataFrame(transactions)
|
|
||||||
|
|
||||||
# ---- Save locally ----
|
|
||||||
local_transactions_file = "../Data/transactions.csv"
|
|
||||||
transactions_df.to_csv(local_transactions_file, index=False)
|
|
||||||
print("Generated transactions.csv locally with realistic branch/vendor data.")
|
|
||||||
|
|
||||||
# ---- Upload to S3 ----
|
|
||||||
try:
|
|
||||||
s3.Bucket(bucket_name).upload_file(local_transactions_file, transactions_s3_key)
|
|
||||||
print(f"Uploaded transactions.csv to s3://{bucket_name}/{transactions_s3_key}")
|
|
||||||
except Exception as e:
|
|
||||||
print("ERROR: Could not upload transactions.csv to S3:", e)
|
|
||||||
Loading…
Reference in a new issue