Source code for sheetwise.classifiers
"""Data type classification utilities for spreadsheet cells."""
import re
from typing import Any
import pandas as pd
[docs]
class DataTypeClassifier:
"""Rule-based classifier for identifying data types, PII, and business entities."""
[docs]
@staticmethod
def classify_cell_type(value: Any) -> str:
"""Classify cell value into predefined data types."""
if pd.isna(value) or value == "" or value is None:
return "Empty"
str_value = str(value).strip()
# Check if value is effectively empty after stripping
if not str_value:
return "Empty"
# --- Business & PII Logic (High Priority) ---
# Email
if re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", str_value):
return "PII_Email"
# US Phone Number (Loose match)
if re.match(r"^(\+\d{1,2}\s)?\(?\d{3}\)?[\s.-]\d{3}[\s.-]\d{4}$", str_value):
return "PII_Phone"
# SSN (Simple US format)
if re.match(r"^\d{3}-\d{2}-\d{4}$", str_value):
return "PII_SSN"
# IBAN (International Bank Account Number) - Generic structure
if re.match(r"^[A-Z]{2}\d{2}[A-Z0-9]{1,30}$", str_value):
return "Biz_IBAN"
# Ticker Symbol (e.g., $AAPL, AAPL) - Heuristic: All caps, 2-5 chars
if re.match(r"^\$?[A-Z]{2,5}$", str_value) and not re.match(r"^\d+$", str_value):
return "Biz_Ticker"
# --- Standard Types (Lower Priority) ---
# Year pattern
if re.match(r"^\d{4}$", str_value) and 1900 <= int(str_value) <= 2100:
return "Year"
# Scientific notation
if re.match(r"^-?\d+\.?\d*[eE][+-]?\d+$", str_value):
return "Scientific"
# Integer
try:
int(str_value.replace(",", ""))
if "." not in str_value:
return "Integer"
except ValueError:
pass
# Float
try:
float(str_value.replace(",", ""))
return "Float"
except ValueError:
pass
# Percentage
if str_value.endswith("%"):
return "Percentage"
# Date patterns
date_patterns = [
r"^\d{4}[-/]\d{1,2}[-/]\d{1,2}$",
r"^\d{1,2}[-/]\d{1,2}[-/]\d{4}$",
r"^\d{1,2}-\w{3}-\d{4}$",
]
for pattern in date_patterns:
if re.match(pattern, str_value):
return "Date"
# Time patterns
if re.match(r"^\d{1,2}:\d{2}(:\d{2})?(\s?(AM|PM))?$", str_value, re.IGNORECASE):
return "Time"
# Currency
currency_symbols = ["$", "€", "£", "¥", "₹"]
if any(symbol in str_value for symbol in currency_symbols):
return "Currency"
return "String"