nanotorch implementation

2025-11-17 21:32:28 +05:30
parent 618423e8d2
commit a6e32fcc71
19 changed files with 809 additions and 2 deletions

nanotorch/__init__.py (new file, 0 lines)

nanotorch/nn.py (new file, 88 lines)

@@ -0,0 +1,88 @@
import numpy as np

from nanotorch.tensor import Tensor


class Linear:
    def __init__(self, in_features, out_features):
        """
        A linear layer: y = x @ W + b

        Args:
            in_features: input dims
            out_features: output dims
        """
        self.W = Tensor(np.random.randn(in_features, out_features) * 0.1)
        self.b = Tensor(np.zeros(out_features))

    def __call__(self, x):
        """
        Forward pass: y = x @ W + b

        Args:
            x: input tensor, shape = (batch_size, in_features)

        Returns:
            output tensor, shape = (batch_size, out_features)
        """
        return x @ self.W + self.b

    def parameters(self):
        """
        Returns:
            List of trainable parameters.
        """
        return [self.W, self.b]


class MLP:
    """
    An MLP is just stacked linear layers with activations:
    Input → Linear → ReLU → Linear → ReLU → Linear → Output
    """

    def __init__(self, layer_sizes):
        """
        MLP with ReLU activations.

        Args:
            layer_sizes: list of layer dims [input, hidden1, hidden2, ..., output]
                e.g. [2, 16, 16, 1] means:
                - input: 2 features
                - 2 hidden layers with 16 neurons each
                - output: 1 value
        """
        self.layers = []
        for i in range(len(layer_sizes) - 1):
            self.layers.append(Linear(layer_sizes[i], layer_sizes[i + 1]))

    def __call__(self, x):
        """
        Forward pass with ReLU activation between layers.
        No activation on the final layer (common for regression/raw logits).
        """
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < len(self.layers) - 1:
                x = x.relu()
        return x

    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params


class SGD:
    def __init__(self, parameters, lr=0.01):
        """
        Args:
            parameters: list of Tensor parameters to update
            lr: learning rate
        """
        self.parameters = parameters
        self.lr = lr

    def step(self):
        for param in self.parameters:
            param.data -= self.lr * param.grad

    def zero_grad(self):
        for param in self.parameters:
            param.grad = np.zeros_like(param.data, dtype=np.float64)
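
The pieces above compose into the usual train loop: forward, loss, zero_grad, backward, step. The following is a minimal usage sketch, assuming the package layout shown in this commit (nanotorch.nn and nanotorch.tensor); the toy dataset, layer sizes, learning rate, and epoch count are illustrative choices, not part of the diff.

import numpy as np
from nanotorch.nn import MLP, SGD
from nanotorch.tensor import Tensor

# toy regression target: y = x0 + x1 on random inputs (illustrative data)
X = Tensor(np.random.randn(64, 2))
y = Tensor((X.data[:, 0] + X.data[:, 1]).reshape(-1, 1))

model = MLP([2, 16, 16, 1])
optim = SGD(model.parameters(), lr=0.05)

for epoch in range(200):
    pred = model(X)                  # forward pass, shape (64, 1)
    loss = ((pred - y) ** 2).mean()  # mean squared error
    optim.zero_grad()                # clear gradients from the previous step
    loss.backward()                  # backprop through the whole graph
    optim.step()                     # gradient descent update

print(loss.data)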

nanotorch/tensor.py (new file, 189 lines)

@@ -0,0 +1,189 @@
import numpy as np


class Tensor:
    def __init__(self, data, _parents=(), _op=''):
        self.data = np.array(data) if not isinstance(data, np.ndarray) else data
        self._parents = _parents
        self._op = _op
        # gradient: same shape as data, init to zeros
        self.grad = np.zeros_like(self.data, dtype=np.float64)
        self._backward = lambda: None

    def __repr__(self) -> str:
        return f"Tensor(data={self.data}, grad={self.grad})"

    def __neg__(self) -> 'Tensor':
        return self * -1

    def __sub__(self, other: 'Tensor') -> 'Tensor':
        return self + (-other)

    def __rsub__(self, other: 'Tensor') -> 'Tensor':
        return Tensor(other) - self

    def __rmul__(self, other: 'Tensor') -> 'Tensor':
        return self * other

    def __radd__(self, other: 'Tensor') -> 'Tensor':
        return self + other

    def _unbroadcast(self, grad, original_shape):
        # sum over leading dimensions that were added
        while len(grad.shape) > len(original_shape):
            grad = grad.sum(axis=0)
        # sum over dims that were size 1 but got broadcasted
        for i, (grad_dim, orig_dim) in enumerate(zip(grad.shape, original_shape)):
            if orig_dim == 1 and grad_dim > 1:
                grad = grad.sum(axis=i, keepdims=True)
        return grad
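
    # Worked example (illustrative note, not part of the original commit):
    # forward, a (2, 3) + (3,) add broadcasts the second operand to (2, 3),
    # so out.grad has shape (2, 3). To recover a (3,)-shaped gradient,
    # _unbroadcast sums out.grad over the added leading axis:
    #   grad (2, 3) --sum(axis=0)--> (3,)
    # A (1, 3) operand is instead reduced with sum(axis=0, keepdims=True),
    # because its first dim was size 1 and got stretched to 2.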

    def __add__(self, other: 'Tensor') -> 'Tensor':
        # handle scalar addition
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, (self, other), '+')

        def _backward():
            # undo broadcast by summing over broadcast dims
            grad_self = self._unbroadcast(out.grad, self.data.shape)
            grad_other = other._unbroadcast(out.grad, other.data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other

        out._backward = _backward
        return out

    def __mul__(self, other: 'Tensor') -> 'Tensor':
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, (self, other), '*')

        def _backward():
            # local gradients with broadcasting
            grad_self = out.grad * other.data
            grad_other = out.grad * self.data
            grad_self = self._unbroadcast(grad_self, self.data.shape)
            grad_other = other._unbroadcast(grad_other, other.data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other

        out._backward = _backward
        return out

    def __matmul__(self, other: 'Tensor') -> 'Tensor':
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data @ other.data, (self, other), '@')

        def _backward():
            self_data = self.data
            other_data = other.data
            if other_data.ndim == 1:
                grad_self = out.grad.reshape(-1, 1) @ other_data.reshape(1, -1)
            else:
                other_data_T = other_data.swapaxes(-2, -1)
                grad_self = out.grad @ other_data_T
            grad_self = self._unbroadcast(grad_self, self_data.shape)
            if self_data.ndim == 1:
                self_data_T = self_data.reshape(-1, 1)
                grad_other = self_data_T @ out.grad.reshape(1, -1)
            else:
                self_data_T = self_data.swapaxes(-2, -1)
                grad_other = self_data_T @ out.grad
            grad_other = other._unbroadcast(grad_other, other_data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other

        out._backward = _backward
        return out
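
    # Gradient identities used above (standard matrix calculus, added as an
    # explanatory note): for C = A @ B with upstream gradient dL/dC,
    #   dL/dA = dL/dC @ B^T    and    dL/dB = A^T @ dL/dC
    # The 1-D branches reshape vectors into row/column matrices so the same
    # identities apply, and batched (ndim > 2) operands are handled via
    # swapaxes(-2, -1) plus _unbroadcast over the batch dimensions.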

    def __pow__(self, power) -> 'Tensor':
        assert isinstance(power, (int, float)), "only support int/float powers"
        out = Tensor(self.data ** power, (self, ), f'**{power}')

        def _backward():
            self.grad += power * (self.data ** (power - 1)) * out.grad

        out._backward = _backward
        return out

    def sum(self, axis=None, keepdims=False) -> 'Tensor':
        out = Tensor(self.data.sum(axis=axis, keepdims=keepdims), (self, ), 'sum')

        def _backward():
            grad = out.grad
            if axis is not None and not keepdims:
                grad = np.expand_dims(grad, axis=axis)
            self.grad += np.broadcast_to(grad, self.data.shape)

        out._backward = _backward
        return out

    def mean(self, axis=None, keepdims=False):
        if axis is None:
            count = self.data.size
        else:
            count = self.data.shape[axis]
        out = Tensor(self.data.mean(axis=axis, keepdims=keepdims), (self, ), 'mean')

        def _backward():
            grad = out.grad / count
            if axis is not None and not keepdims:
                grad = np.expand_dims(grad, axis=axis)
            self.grad += np.broadcast_to(grad, self.data.shape)

        out._backward = _backward
        return out
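
    # Note (explanatory, not in the original commit): sum/mean backprop simply
    # broadcasts the upstream gradient back to the input shape; mean additionally
    # scales it by 1/count, since each input element carries weight 1/count in
    # the averaged output.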

    def backward(self):
        # build topological order
        topo = []
        visited = set()

        def build_topo(tensor):
            if tensor not in visited:
                visited.add(tensor)
                for parent in tensor._parents:
                    build_topo(parent)
                topo.append(tensor)

        build_topo(self)
        # init gradient of output to 1
        self.grad = np.ones_like(self.data, dtype=np.float64)
        # backprop
        for node in reversed(topo):
            node._backward()

    # activation functions
    def relu(self) -> 'Tensor':
        out = Tensor(np.maximum(0, self.data), (self, ), 'ReLU')

        def _backward():
            self.grad += (self.data > 0) * out.grad

        out._backward = _backward
        return out

    def tanh(self) -> 'Tensor':
        t = np.tanh(self.data)
        out = Tensor(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad

        out._backward = _backward
        return out