29from polars
import col, when
def safe_vector_cast(df: "pl.DataFrame", schema: dict) -> "pl.DataFrame":
    """!Cast a Polars DataFrame to match a declared schema, handling 'NA' strings as nulls.

    This function enforces schema alignment between a raw input DataFrame (typically from CSV)
    and a declared schema. For a column whose schema entry has `allow_na` set and whose current
    dtype is `pl.Utf8`, values equal to the exact string "NA" are replaced with nulls prior to
    casting. All other columns are cast directly.

    @param df The input Polars DataFrame to cast.
    @param schema Dictionary in the format { column_name: (dtype, allow_na) }.

    @return A new Polars DataFrame with all schema fields cast according to the schema.

    @throws ValueError If any schema field is missing in the DataFrame.
    @throws Exception Any error raised while building or applying the casts is logged
            to stdout and then re-raised unchanged.
    """
    missing = [name for name in schema if name not in df.columns]
    if missing:
        # Print both sides of the comparison so the mismatch can be diagnosed
        # from the log alone, without re-running against the input.
        print("SCHEMA MISMATCH DETECTED")
        print("Expected columns (from schema):")
        print(list(schema))
        print("Found columns (in DataFrame):")
        print(list(df.columns))
        print("Missing columns:")
        print(missing)
        raise ValueError(f"Schema mismatch: {len(missing)} missing column(s)")

    try:
        casted = []
        for c, (dtype, allow_na) in schema.items():
            if allow_na and df[c].dtype == pl.Utf8:
                # Only string columns can carry the literal "NA" marker; null it
                # out first so the subsequent cast does not choke on it.
                expr = when(col(c) == "NA").then(None).otherwise(col(c)).cast(dtype).alias(c)
            else:
                expr = col(c).cast(dtype).alias(c)
            casted.append(expr)

        return df.with_columns(casted)

    except Exception as e:
        print("[ERROR] safe_vector_cast failed:", e)
        print("[DEBUG] DataFrame columns:", df.columns)
        # Report the schema that was actually passed in, not a module-level global.
        print("[DEBUG] Schema fields:", list(schema.keys()))
        raise
def safe_div(numerator, denominator):
    """!Safely performs division, handling 'NA' values and invalid input.

    Returns a rounded division result unless input is invalid or contains the string "NA",
    in which case "NA" is returned instead. Inputs are converted with float(), so numeric
    strings such as "3" are accepted.

    @param numerator Numerator of the division (can be int, float, or "NA").
    @param denominator Denominator of the division (can be int, float, or "NA").

    @return Result of division rounded to 4 decimal places, or "NA" if invalid.
    """
    if "NA" in (numerator, denominator):
        return "NA"

    try:
        num = float(numerator)
        denom = float(denominator)
        return round(num / denom, 4)
    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
        # Non-numeric input, None, division by zero, or an int too large for
        # float all count as "invalid" and map to the "NA" marker.
        return "NA"
def safe_div_percent(numerator, denominator):
    """!Computes percentage-based division safely, with 'NA' fallback.

    Similar to safe_div, but multiplies the result by 100 to express it as a percent.
    Invalid input or "NA" strings will return "NA" as a string. The scaling is applied
    before rounding, so the percentage itself carries 4 decimal places.

    @param numerator Numerator of the division (can be int, float, or "NA").
    @param denominator Denominator of the division (can be int, float, or "NA").

    @return Percentage value (rounded to 4 decimals), or "NA" if invalid.
    """
    if "NA" in (numerator, denominator):
        return "NA"

    try:
        num = float(numerator)
        denom = float(denominator)
        return round((num / denom) * 100, 4)
    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
        # Non-numeric input, None, division by zero, or an int too large for
        # float all count as "invalid" and map to the "NA" marker.
        return "NA"
safe_div(numerator, denominator)
Safely performs division, handling 'NA' values and invalid input.
safe_div_percent(numerator, denominator)
Computes percentage-based division safely, with 'NA' fallback.
pl.DataFrame safe_vector_cast(pl.DataFrame df, dict schema)
Cast a Polars DataFrame to match a declared schema, handling 'NA' strings as nulls.