# How this model was built

This notebook shows how we cleaned the autism screening data, engineered features, and trained a simple neural-network-style model (logistic regression in PyTorch) to estimate the probability of a positive ASD classification.

- Goal: Estimate ASD likelihood from A1–A10 items and demographics
- Steps: Load → Clean/Encode → Split/Scale → Train (PyTorch) → Evaluate
- Note: Educational only; not medical advice. Use responsibly and validate before any clinical use.


In [None]:
# Imports & reproducibility
import os, random, math
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# One seed shared by every RNG this notebook touches, so reruns are repeatable.
SEED = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# Record the framework version next to the results.
print(f"Using torch {torch.__version__}")


In [None]:
# Load, clean, encode
# Location of the screening CSV; a relative path, so it is looked up from the
# notebook's working directory.
CSV_PATH = 'autism_screening.csv'
# Fail fast with a readable message before any parsing is attempted.
csv_is_present = os.path.exists(CSV_PATH)
assert csv_is_present, f"CSV not found at {CSV_PATH}"

def to_snake(s: str) -> str:
    """Normalize a raw column header into a snake_case identifier.

    The header is lowercased, ``/`` and ``-`` are treated as word separators,
    and every run of separators/whitespace (of any length, including tabs and
    leading/trailing runs) collapses to a single underscore.

    Args:
        s: Raw column name, e.g. ``"Class/ASD"``.

    Returns:
        The normalized name, e.g. ``"class_asd"``. An empty or
        separator-only input yields ``""``.
    """
    # split() with no argument drops leading/trailing whitespace and collapses
    # runs of any length; a single replace('  ', ' ') pass left doubled
    # underscores for inputs such as "a - b".
    words = s.lower().replace('/', ' ').replace('-', ' ').split()
    return '_'.join(words)

# 1) Load
raw = pd.read_csv(CSV_PATH)
raw.columns = [to_snake(c) for c in raw.columns]
print(raw.shape, list(raw.columns)[:10])

# 2) Basic normalization: trim and lowercase every text cell, then turn the
#    placeholder tokens '?' and 'nan' (the latter produced by astype(str) on
#    real NaNs) back into proper missing values.
for c in raw.select_dtypes(include='object').columns:
    raw[c] = raw[c].astype(str).str.strip().str.lower().replace({'?': np.nan, 'nan': np.nan})

# Boolean mappings commonly used in this dataset.
# Values outside {'yes', 'no'} become NaN here and are imputed further down.
bool_map = {'yes': 1, 'no': 0}
for c in ['jundice','austim','used_app_before']:
    if c in raw.columns:
        raw[c] = raw[c].map(bool_map)

# NOTE(review): missing/unknown gender is coded as 0, i.e. the same value as 'f'.
if 'gender' in raw.columns:
    raw['gender'] = raw['gender'].map({'m': 1, 'f': 0}).fillna(0)

# A-scores (A1..A10) → ensure ints; also create result total
as_cols = [f'a{i}_score' for i in range(1, 11) if f'a{i}_score' in raw.columns]
for c in as_cols:
    raw[c] = pd.to_numeric(raw[c], errors='coerce').fillna(0).astype(int)
# NOTE(review): this overwrites any 'result' column read from the CSV and keeps
# the total as a feature; if the label was derived from this total it leaks
# the target — confirm against the dataset description.
raw['result'] = raw[as_cols].sum(axis=1) if as_cols else 0

# Target column detection
TARGET = None
for cand in ['class_asd', 'class','asd','label','target']:
    if cand in raw.columns:
        TARGET = cand
        break
assert TARGET is not None, 'Target column not found (expected class_asd or similar).'

# Map target to 0/1
# NOTE(review): labels other than 'yes'/'no' (including missing ones) are
# silently treated as the negative class.
if raw[TARGET].dtype == 'object':
    raw[TARGET] = raw[TARGET].map(bool_map).fillna(0).astype(int)

# One-hot encode categoricals; drop first to avoid collinearity.
# 'contry_of_res' is accepted as an alternative spelling of the country header.
known_cat_cols = ['ethnicity', 'country_of_res', 'contry_of_res', 'age_desc', 'relation']
cat_cols = [c for c in known_cat_cols if c in raw.columns]
X = raw.drop(columns=[TARGET])

# Any text column that is not in the known list would otherwise stay as strings
# and make StandardScaler fail later. Columns whose non-missing values all
# parse as numbers become numeric; everything else is one-hot encoded too.
for c in list(X.columns):
    if c in cat_cols or pd.api.types.is_numeric_dtype(X[c]):
        continue
    as_number = pd.to_numeric(X[c], errors='coerce')
    if as_number.notna().sum() == X[c].notna().sum():
        X[c] = as_number
    else:
        cat_cols.append(c)

X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Impute numerics with median (only columns that actually have gaps).
# NOTE(review): medians are computed on the full table, before the train/test
# split performed in the next cell.
for c in X.columns:
    if not X[c].isna().any():
        continue
    if pd.api.types.is_numeric_dtype(X[c]):
        col_median = X[c].median()
        # An all-missing column has a NaN median; fall back to 0 so that no
        # NaN reaches scaling/training.
        X[c] = X[c].fillna(0 if pd.isna(col_median) else col_median)
    else:
        X[c] = X[c].fillna(0)

y = raw[TARGET].astype(int)

print('Features:', X.shape, 'Target:', y.shape)


In [None]:
# Split & scale
# Hold out 30% of the rows for testing, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=SEED,
)

# The scaler is fitted on the training partition only and then reused,
# unchanged, on the test partition. Both results are cast to float32.
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_s = scaler.fit_transform(X_train).astype(np.float32)
X_test_s = scaler.transform(X_test).astype(np.float32)

# Labels as float32 column vectors of shape (n, 1).
y_train_a = y_train.to_numpy().astype(np.float32).reshape(-1, 1)
y_test_a = y_test.to_numpy().astype(np.float32).reshape(-1, 1)

X_train_s.shape, X_test_s.shape


In [None]:
# PyTorch logistic model & training
class TorchLogReg(nn.Module):
    """Logistic regression expressed as a single linear layer.

    The module returns raw logits; no sigmoid is applied inside it.
    """

    def __init__(self, in_features: int):
        """Create the layer mapping ``in_features`` inputs to one logit."""
        super().__init__()
        self.linear = nn.Linear(in_features=in_features, out_features=1)

    def forward(self, x):
        """Return the logit for each row of ``x``."""
        logit = self.linear(x)
        return logit

def _snapshot_weights(module):
    """Return a CPU copy of ``module``'s state dict, detached from later updates."""
    return {k: v.cpu().clone() for k, v in module.state_dict().items()}


in_features = X_train_s.shape[1]
model = TorchLogReg(in_features)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Train/val split from training set (shuffled with the globally seeded NumPy RNG)
idx = np.arange(X_train_s.shape[0])
np.random.shuffle(idx)
val_ratio = 0.2
val_n = int(len(idx) * val_ratio)
val_idx, tr_idx = idx[:val_n], idx[val_n:]
# An empty partition would surface later as a NaN validation loss or a
# division by zero when averaging the training loss; stop here instead.
assert val_n > 0 and len(tr_idx) > 0, (
    f'Training set too small to split: {len(idx)} rows with val_ratio={val_ratio}'
)

X_tr = torch.tensor(X_train_s[tr_idx], dtype=torch.float32).to(device)
Y_tr = torch.tensor(y_train_a[tr_idx], dtype=torch.float32).to(device)
X_val = torch.tensor(X_train_s[val_idx], dtype=torch.float32).to(device)
Y_val = torch.tensor(y_train_a[val_idx], dtype=torch.float32).to(device)

train_loader = DataLoader(TensorDataset(X_tr, Y_tr), batch_size=64, shuffle=True)

# BCEWithLogitsLoss takes the raw logits the model outputs.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

best_val = float('inf')
patience, bad_epochs = 5, 0
history = {'train_loss': [], 'val_loss': []}
# Start from a snapshot of the initial weights so `best_state` is always
# defined, even if the validation loss never improves (e.g. it is NaN).
best_state = _snapshot_weights(model)

for epoch in range(200):
    model.train()
    running = 0.0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean even
        # when the last batch is smaller.
        running += loss.item() * xb.size(0)
    train_loss = running / len(train_loader.dataset)

    # val (full validation set in one pass, no gradients)
    model.eval()
    with torch.no_grad():
        val_logits = model(X_val)
        val_loss = criterion(val_logits, Y_val).item()

    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)

    # Early stopping: an epoch only counts as an improvement if it beats the
    # best validation loss by more than 1e-6.
    if val_loss + 1e-6 < best_val:
        best_val = val_loss
        bad_epochs = 0
        best_state = _snapshot_weights(model)
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            break

# load best
model.load_state_dict(best_state)
print('Best val loss:', best_val)


In [None]:
# Evaluate on test & visualize training loss
import matplotlib.pyplot as plt

X_te = torch.tensor(X_test_s, dtype=torch.float32).to(device)
# Set inference mode explicitly instead of relying on whatever mode the
# training cell happened to leave the model in.
model.eval()
with torch.no_grad():
    test_logits = model(X_te)
    # torch.sigmoid avoids the float32 overflow (and RuntimeWarning) that
    # 1 / (1 + exp(-z)) hits for large negative logits.
    probs = torch.sigmoid(test_logits).cpu().numpy().ravel()
logits = test_logits.cpu().numpy().ravel()

# Hard labels at the conventional 0.5 probability threshold.
preds = (probs >= 0.5).astype(int)

acc = metrics.accuracy_score(y_test, preds)
prec = metrics.precision_score(y_test, preds, zero_division=0)
rec = metrics.recall_score(y_test, preds, zero_division=0)
f1 = metrics.f1_score(y_test, preds, zero_division=0)
# ROC AUC is threshold-free, so it is computed from the probabilities.
roc = metrics.roc_auc_score(y_test, probs)
print({'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1, 'roc_auc':roc})

# Training curve
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(history['train_loss'], label='train')
ax.plot(history['val_loss'], label='val')
ax.set_title('Training/Validation Loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.legend()
fig.tight_layout()
plt.show()
