nanotorch implementation

2025-11-17 21:32:28 +05:30
parent 618423e8d2
commit a6e32fcc71
19 changed files with 809 additions and 2 deletions

nanotorch/__init__.py (new file, 0 lines)

nanotorch/nn.py (new file, 88 lines)

@@ -0,0 +1,88 @@
import numpy as np

from nanotorch.tensor import Tensor


class Linear:
    def __init__(self, in_features, out_features):
        """
        A linear layer: y = x @ W + b

        Args:
            in_features: input dims
            out_features: output dims
        """
        self.W = Tensor(np.random.randn(in_features, out_features) * 0.1)
        self.b = Tensor(np.zeros(out_features))

    def __call__(self, x):
        """
        Forward pass: y = x @ W + b

        Args:
            x: input tensor, shape = (batch_size, in_features)

        Returns:
            output tensor, shape = (batch_size, out_features)
        """
        return x @ self.W + self.b

    def parameters(self):
        """
        Returns:
            List of trainable parameters.
        """
        return [self.W, self.b]


class MLP:
    """
    An MLP is just stacked linear layers with activations:
    Input → Linear → ReLU → Linear → ReLU → Linear → Output
    """

    def __init__(self, layer_sizes):
        """
        MLP with ReLU activations.

        Args:
            layer_sizes: list of layer dims [input, hidden1, hidden2, ..., output]
                e.g. [2, 16, 16, 1] means:
                - input: 2 features
                - 2 hidden layers with 16 neurons each
                - output: 1 value
        """
        self.layers = []
        for i in range(len(layer_sizes) - 1):
            self.layers.append(Linear(layer_sizes[i], layer_sizes[i + 1]))

    def __call__(self, x):
        """
        Forward pass with ReLU activation between layers.
        No activation on the final layer (common for regression/raw logits).
        """
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < len(self.layers) - 1:
                x = x.relu()
        return x

    def parameters(self):
        params = []
        for layer in self.layers:
            params.extend(layer.parameters())
        return params


class SGD:
    def __init__(self, parameters, lr=0.01):
        """
        Args:
            parameters: list of Tensor parameters to update
            lr: learning rate
        """
        self.parameters = parameters
        self.lr = lr

    def step(self):
        for param in self.parameters:
            param.data -= self.lr * param.grad

    def zero_grad(self):
        for param in self.parameters:
            param.grad = np.zeros_like(param.data, dtype=np.float64)
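
The pieces above compose into the usual train loop: forward, loss, zero_grad, backward, step. The following is a minimal usage sketch, assuming the package layout shown in this commit (nanotorch.nn and nanotorch.tensor); the toy dataset, layer sizes, learning rate, and epoch count are illustrative choices, not part of the diff.

import numpy as np
from nanotorch.nn import MLP, SGD
from nanotorch.tensor import Tensor

# toy regression target: y = x0 + x1 on random inputs (illustrative data)
X = Tensor(np.random.randn(64, 2))
y = Tensor((X.data[:, 0] + X.data[:, 1]).reshape(-1, 1))

model = MLP([2, 16, 16, 1])
optim = SGD(model.parameters(), lr=0.05)

for epoch in range(200):
    pred = model(X)                  # forward pass, shape (64, 1)
    loss = ((pred - y) ** 2).mean()  # mean squared error
    optim.zero_grad()                # clear gradients from the previous step
    loss.backward()                  # backprop through the whole graph
    optim.step()                     # gradient descent update

print(loss.data)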

nanotorch/tensor.py (new file, 189 lines)

@@ -0,0 +1,189 @@
import numpy as np


class Tensor:
    def __init__(self, data, _parents=(), _op=''):
        self.data = np.array(data) if not isinstance(data, np.ndarray) else data
        self._parents = _parents
        self._op = _op
        # gradient: same shape as data, init to zeros
        self.grad = np.zeros_like(self.data, dtype=np.float64)
        self._backward = lambda: None

    def __repr__(self) -> str:
        return f"Tensor(data={self.data}, grad={self.grad})"

    def __neg__(self) -> 'Tensor':
        return self * -1

    def __sub__(self, other: 'Tensor') -> 'Tensor':
        return self + (-other)

    def __rsub__(self, other: 'Tensor') -> 'Tensor':
        return Tensor(other) - self

    def __rmul__(self, other: 'Tensor') -> 'Tensor':
        return self * other

    def __radd__(self, other: 'Tensor') -> 'Tensor':
        return self + other

    def _unbroadcast(self, grad, original_shape):
        # sum over leading dimensions that were added
        while len(grad.shape) > len(original_shape):
            grad = grad.sum(axis=0)
        # sum over dims that were size 1 but got broadcasted
        for i, (grad_dim, orig_dim) in enumerate(zip(grad.shape, original_shape)):
            if orig_dim == 1 and grad_dim > 1:
                grad = grad.sum(axis=i, keepdims=True)
        return grad
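
    # Worked example (illustrative note, not part of the original commit):
    # forward, a (2, 3) + (3,) add broadcasts the second operand to (2, 3),
    # so out.grad has shape (2, 3). To recover a (3,)-shaped gradient,
    # _unbroadcast sums out.grad over the added leading axis:
    #   grad (2, 3) --sum(axis=0)--> (3,)
    # A (1, 3) operand is instead reduced with sum(axis=0, keepdims=True),
    # because its first dim was size 1 and got stretched to 2.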

    def __add__(self, other: 'Tensor') -> 'Tensor':
        # handle scalar addition
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, (self, other), '+')

        def _backward():
            # undo broadcast by summing over broadcast dims
            grad_self = self._unbroadcast(out.grad, self.data.shape)
            grad_other = other._unbroadcast(out.grad, other.data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other

        out._backward = _backward
        return out

    def __mul__(self, other: 'Tensor') -> 'Tensor':
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, (self, other), '*')

        def _backward():
            # local gradients with broadcasting
            grad_self = out.grad * other.data
            grad_other = out.grad * self.data
            grad_self = self._unbroadcast(grad_self, self.data.shape)
            grad_other = other._unbroadcast(grad_other, other.data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other

        out._backward = _backward
        return out

    def __matmul__(self, other: 'Tensor') -> 'Tensor':
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data @ other.data, (self, other), '@')

        def _backward():
            self_data = self.data
            other_data = other.data
            if other_data.ndim == 1:
                grad_self = out.grad.reshape(-1, 1) @ other_data.reshape(1, -1)
            else:
                other_data_T = other_data.swapaxes(-2, -1)
                grad_self = out.grad @ other_data_T
            grad_self = self._unbroadcast(grad_self, self_data.shape)
            if self_data.ndim == 1:
                self_data_T = self_data.reshape(-1, 1)
                grad_other = self_data_T @ out.grad.reshape(1, -1)
            else:
                self_data_T = self_data.swapaxes(-2, -1)
                grad_other = self_data_T @ out.grad
            grad_other = other._unbroadcast(grad_other, other_data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other

        out._backward = _backward
        return out
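
    # Gradient identities used above (standard matrix calculus, added as an
    # explanatory note): for C = A @ B with upstream gradient dL/dC,
    #   dL/dA = dL/dC @ B^T    and    dL/dB = A^T @ dL/dC
    # The 1-D branches reshape vectors into row/column matrices so the same
    # identities apply, and batched (ndim > 2) operands are handled via
    # swapaxes(-2, -1) plus _unbroadcast over the batch dimensions.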

    def __pow__(self, power) -> 'Tensor':
        assert isinstance(power, (int, float)), "only support int/float powers"
        out = Tensor(self.data ** power, (self, ), f'**{power}')

        def _backward():
            self.grad += power * (self.data ** (power - 1)) * out.grad

        out._backward = _backward
        return out

    def sum(self, axis=None, keepdims=False) -> 'Tensor':
        out = Tensor(self.data.sum(axis=axis, keepdims=keepdims), (self, ), 'sum')

        def _backward():
            grad = out.grad
            if axis is not None and not keepdims:
                grad = np.expand_dims(grad, axis=axis)
            self.grad += np.broadcast_to(grad, self.data.shape)

        out._backward = _backward
        return out

    def mean(self, axis=None, keepdims=False):
        if axis is None:
            count = self.data.size
        else:
            count = self.data.shape[axis]
        out = Tensor(self.data.mean(axis=axis, keepdims=keepdims), (self, ), 'mean')

        def _backward():
            grad = out.grad / count
            if axis is not None and not keepdims:
                grad = np.expand_dims(grad, axis=axis)
            self.grad += np.broadcast_to(grad, self.data.shape)

        out._backward = _backward
        return out
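
    # Note (explanatory, not in the original commit): sum/mean backprop simply
    # broadcasts the upstream gradient back to the input shape; mean additionally
    # scales it by 1/count, since each input element carries weight 1/count in
    # the averaged output.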

    def backward(self):
        # build topological order
        topo = []
        visited = set()

        def build_topo(tensor):
            if tensor not in visited:
                visited.add(tensor)
                for parent in tensor._parents:
                    build_topo(parent)
                topo.append(tensor)

        build_topo(self)
        # init gradient of output to 1
        self.grad = np.ones_like(self.data, dtype=np.float64)
        # backprop
        for node in reversed(topo):
            node._backward()

    # activation functions
    def relu(self) -> 'Tensor':
        out = Tensor(np.maximum(0, self.data), (self, ), 'ReLU')

        def _backward():
            self.grad += (self.data > 0) * out.grad

        out._backward = _backward
        return out

    def tanh(self) -> 'Tensor':
        t = np.tanh(self.data)
        out = Tensor(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1 - t**2) * out.grad

        out._backward = _backward
        return out