Monte Carlo Benchmarking Engine
High-performance SIMD Monte Carlo engine (AVX2/NEON) with custom memory allocators and perf logging.
 
Loading...
Searching...
No Matches
utils.py
Go to the documentation of this file.
1# ===========================================
2# utils.py
3# ===========================================
4
5
26
27
28from pipeline.schema import SCHEMA
29from polars import col, when
30import polars as pl
31
32
def safe_vector_cast(df: pl.DataFrame, schema: dict) -> pl.DataFrame:
    """!Cast a Polars DataFrame to match a declared schema, handling 'NA' strings as nulls.

    Every column named in `schema` is cast to its declared dtype. For a column whose
    `allow_na` flag is truthy and whose current dtype is `pl.Utf8`, values equal to the
    string "NA" are replaced with nulls before the cast.

    On any failure the error, the DataFrame columns and the fields of the `schema`
    argument are printed, and the original exception is re-raised.

    @param df The input Polars DataFrame to cast.
    @param schema Dictionary in the format { column_name: (dtype, allow_na) }.

    @return A new Polars DataFrame with all schema fields casted according to the schema.

    @throws ValueError If any schema field is missing in the DataFrame.
    """
    try:
        missing = [name for name in schema if name not in df.columns]

        if missing:
            # Dump both sides of the comparison so the mismatch can be diagnosed
            # from the console output alone.
            print("SCHEMA MISMATCH DETECTED")
            print("Expected columns (from schema):")
            for expected in schema:
                print(f" {expected}")
            print("Found columns (in DataFrame):")
            for found in df.columns:
                print(f" {found}")
            print("Missing columns:")
            for absent in missing:
                print(f" {absent}")
            raise ValueError(f"Schema mismatch: {len(missing)} missing column(s)")

        casted = []

        for name, (dtype, allow_na) in schema.items():
            if allow_na and df[name].dtype == pl.Utf8:
                # Null out literal "NA" strings first so the cast does not choke on them.
                expr = (
                    when(col(name) == "NA")
                    .then(None)
                    .otherwise(col(name))
                    .cast(dtype)
                    .alias(name)
                )
            else:
                expr = col(name).cast(dtype).alias(name)
            casted.append(expr)

        return df.with_columns(casted)

    except Exception as e:
        print("[ERROR] safe_vector_cast failed:", e)
        print("[DEBUG] DataFrame columns:", df.columns)
        # Report the schema that was actually passed in, not the module-level SCHEMA,
        # which may be a different mapping from the one being validated.
        print("[DEBUG] Schema fields:", list(schema))
        # Bare raise re-raises the active exception with its traceback intact.
        raise
79
def safe_div(numerator, denominator):
    """!Safely performs division, handling 'NA' values and invalid input.

    Returns a rounded division result unless input is invalid or contains the string "NA",
    in which case "NA" is returned instead. Both operands are converted with float(), so
    numeric strings such as "10" are accepted.

    @param numerator Numerator of the division (can be int, float, or "NA").
    @param denominator Denominator of the division (can be int, float, or "NA").

    @return Result of division rounded to 4 decimal places, or "NA" if invalid.
    """
    try:
        if "NA" in (numerator, denominator):
            return "NA"

        num = float(numerator)
        denom = float(denominator)

        return round(num / denom, 4)
    except (TypeError, ValueError, ArithmeticError):
        # TypeError/ValueError: operand cannot be compared or converted to float.
        # ArithmeticError: division by zero, or an int too large for float().
        # Deliberately not a bare except, so KeyboardInterrupt and SystemExit
        # propagate instead of being reported as "NA".
        return "NA"
101
def safe_div_percent(numerator, denominator):
    """!Computes percentage-based division safely, with 'NA' fallback.

    Similar to safe_div, but multiplies the result by 100 to express it as a percent.
    Invalid input or "NA" strings will return "NA" as a string. Rounding is applied
    after the multiplication by 100.

    @param numerator Numerator of the division (can be int, float, or "NA").
    @param denominator Denominator of the division (can be int, float, or "NA").

    @return Percentage value (rounded to 4 decimals), or "NA" if invalid.
    """
    try:
        if "NA" in (numerator, denominator):
            return "NA"

        num = float(numerator)
        denom = float(denominator)

        return round((num / denom) * 100, 4)
    except (TypeError, ValueError, ArithmeticError):
        # TypeError/ValueError: operand cannot be compared or converted to float.
        # ArithmeticError: division by zero, or an int too large for float().
        # Deliberately not a bare except, so KeyboardInterrupt and SystemExit
        # propagate instead of being reported as "NA".
        return "NA"
safe_div(numerator, denominator)
Safely performs division, handling 'NA' values and invalid input.
Definition utils.py:80
safe_div_percent(numerator, denominator)
Computes percentage-based division safely, with 'NA' fallback.
Definition utils.py:102
pl.DataFrame safe_vector_cast(pl.DataFrame df, dict schema)
Cast a Polars DataFrame to match a declared schema, handling 'NA' strings as nulls.
Definition utils.py:33