diff --git a/README.md b/README.md
index e19d622..af6738f 100644
--- a/README.md
+++ b/README.md
@@ -1,28 +1 @@
-
-
-
-
-Let's tidy up any small things
-
-
-
-Write a really nice README on what it does, how it does and how can one use it
-
-
-
-Make it ready for release (will push to pypi)
-
-
-
-
-
-Make a GitHub action for this actually, so I don't have to do this manually
-
-
-
-There's already a pkg on pypi named "nanotorch", so we might need to name it something else
-
-
-
-
-
+# **smoltorch**
diff --git a/examples/train_classification.py b/examples/train_classification.py
new file mode 100644
index 0000000..be5ab8e
--- /dev/null
+++ b/examples/train_classification.py
@@ -0,0 +1,88 @@
+import numpy as np
+from sklearn.datasets import load_breast_cancer
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from nanotorch.tensor import Tensor
+from nanotorch.nn import MLP, SGD, binary_cross_entropy
+
+# Load the breast cancer dataset (binary classification)
+print("Loading breast cancer dataset...")
+data = load_breast_cancer()
+X, y = data.data, data.target.reshape(-1, 1)
+
+# Normalize features (important for neural networks!)
+scaler = StandardScaler()
+X = scaler.fit_transform(X)
+
+# Split into train and test sets
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+print(f"Training samples: {X_train.shape[0]}")
+print(f"Test samples: {X_test.shape[0]}")
+print(f"Features: {X_train.shape[1]}")
+
+# Create the model (note: output goes through sigmoid)
+class BinaryClassifier(MLP):
+    def __call__(self, x):
+        # Forward pass through the MLP
+        x = super().__call__(x)
+        # Apply sigmoid for probabilities
+        return x.sigmoid()
+
+model = BinaryClassifier([30, 16, 8, 1])  # 30 features -> 1 output probability
+optimizer = SGD(model.parameters(), lr=0.1)
+
+# Training loop
+epochs = 200
+print("\nTraining...")
+
+for epoch in range(epochs):
+    # Convert to tensors
+    X_tensor = Tensor(X_train)
+    y_tensor = Tensor(y_train)
+
+    # Forward pass
+    y_pred = model(X_tensor)
+
+    # Binary cross-entropy loss
+    loss = binary_cross_entropy(y_pred, y_tensor)
+
+    # Backward pass
+    optimizer.zero_grad()
+    loss.backward()
+
+    # Gradient diagnostics: runs after loss.backward(), before optimizer.step()
+    if (epoch + 1) % 20 == 0:
+        # L2 norm of all parameter gradients
+        grad_norm = np.sqrt(sum(np.sum(p.grad**2) for p in model.parameters()))
+        print(f"Epoch {epoch + 1}, Loss: {loss.data:.4f}, Grad norm: {grad_norm:.6f}")
+
+    # Update weights
+    optimizer.step()
+
+    # Calculate training accuracy
+    if (epoch + 1) % 20 == 0:
+        predictions = (y_pred.data > 0.5).astype(float)
+        accuracy = (predictions == y_train).mean()
+        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.data:.4f}, Accuracy: {accuracy:.4f}")
+
+# Evaluate on the test set
+print("\nEvaluating on test set...")
+X_test_tensor = Tensor(X_test)
+y_test_tensor = Tensor(y_test)
+
+y_pred_test = model(X_test_tensor)
+test_loss = binary_cross_entropy(y_pred_test, y_test_tensor)
+
+predictions = (y_pred_test.data > 0.5).astype(float)
+test_accuracy = (predictions == y_test).mean()
+
+print(f"Test Loss: {test_loss.data:.4f}")
+print(f"Test Accuracy: {test_accuracy:.4f}")
+
+print("\nSample predictions:")
+for i in range(5):
+    prob = y_pred_test.data[i, 0]
+    pred = "Benign" if prob > 0.5 else "Malignant"  # sklearn convention: 0 = malignant, 1 = benign
+    true = "Benign" if y_test[i, 0] == 1 else "Malignant"
+    print(f"True: {true}, Predicted: {pred} (prob: {prob:.3f})")
\ No newline at end of file
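Note (not part of the diff): for readers skimming the example, here is the quantity the training loop minimizes, sketched in plain NumPy. The toy values are made up; the eps clamp mirrors the binary_cross_entropy helper added to nanotorch/nn.py below.

import numpy as np

# Hypothetical toy batch: three samples, binary labels and predicted probabilities
y_true = np.array([[1.0], [0.0], [1.0]])
y_pred = np.array([[0.9], [0.2], [0.6]])

# bce: -[y*log(p) + (1-y)*log(1-p)], averaged over the batch
eps = 1e-7  # keeps both log arguments away from log(0)
bce = -np.mean(y_true * np.log(y_pred + eps)
               + (1 - y_true) * np.log(1 - y_pred + eps))
print(f"BCE: {bce:.4f}")  # ~0.2798: mostly-confident correct predictions -> low loss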
diff --git a/nanotorch/nn.py b/nanotorch/nn.py
index b48d8d1..bb790f5 100644
--- a/nanotorch/nn.py
+++ b/nanotorch/nn.py
@@ -1,6 +1,26 @@
 import numpy as np
 from nanotorch.tensor import Tensor
 
+# helper functions
+def binary_cross_entropy(y_pred, y_true):
+    """
+    Binary cross-entropy loss with numerical stability.
+
+    Args:
+        y_pred: predicted probabilities, shape (batch_size, 1)
+        y_true: true labels (0 or 1), shape (batch_size, 1)
+
+    Returns:
+        scalar loss
+    """
+    # nudge both log arguments away from zero to avoid log(0)
+    epsilon = 1e-7
+
+    # bce: -[y*log(p) + (1-y)*log(1-p)]
+    term1 = y_true * (y_pred + epsilon).log()
+    term2 = (Tensor(1.0) - y_true) * (Tensor(1.0) - y_pred + epsilon).log()
+    return -(term1 + term2).mean()
+
 class Linear:
     def __init__(self, in_features, out_features):
         """
@@ -10,7 +30,9 @@ class Linear:
         in_features: input dims
         out_features: output dims
         """
-        self.W = Tensor(np.random.randn(in_features, out_features) * 0.1)
+        # xavier/glorot initialization keeps activation variance stable across layers
+        limit = np.sqrt(6 / (in_features + out_features))
+        self.W = Tensor(np.random.uniform(-limit, limit, (in_features, out_features)))
         self.b = Tensor(np.zeros(out_features))
 
     def __call__(self, x):
diff --git a/nanotorch/tensor.py b/nanotorch/tensor.py
index a72917b..a906403 100644
--- a/nanotorch/tensor.py
+++ b/nanotorch/tensor.py
@@ -147,6 +147,15 @@ class Tensor:
         out._backward = _backward
         return out
 
+    def log(self) -> 'Tensor':
+        out = Tensor(np.log(self.data), (self, ), 'log')
+
+        def _backward():
+            self.grad += (1 / self.data) * out.grad  # d/dx log(x) = 1/x
+
+        out._backward = _backward
+        return out
+
     def backward(self):
         # build topological order
         topo = []
@@ -187,3 +196,14 @@
 
         out._backward = _backward
         return out
+
+    def sigmoid(self) -> 'Tensor':
+        sig = 1 / (1 + np.exp(-self.data))
+        out = Tensor(sig, (self, ), 'sigmoid')
+
+        def _backward():
+            self.grad += sig * (1 - sig) * out.grad  # d/dx sigmoid(x) = sig * (1 - sig)
+
+        out._backward = _backward
+        return out
+
\ No newline at end of file
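Note (not part of the diff): a minimal sanity check for the new autograd ops, assuming only the Tensor API visible in this diff (Tensor, sigmoid, mean, backward, .grad). It compares the analytic gradient of mean(sigmoid(x)) against a central finite difference; log can be checked the same way.

import numpy as np
from nanotorch.tensor import Tensor

def f(a):
    # mean(sigmoid(a)), computed in plain NumPy for the numeric reference
    return np.mean(1.0 / (1.0 + np.exp(-a)))

x = np.array([[0.3, -1.2]])
eps = 1e-5

t = Tensor(x)
out = t.sigmoid().mean()  # scalar output, so backward() seeds it like the training loss
out.backward()

# Central finite differences, one coordinate at a time
num = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    xp, xm = x.copy(), x.copy()
    xp[idx] += eps
    xm[idx] -= eps
    num[idx] = (f(xp) - f(xm)) / (2 * eps)

print("analytic:", t.grad)  # filled in by Tensor.backward()
print("numeric: ", num)     # should agree to ~1e-8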