import numpy as np


class Tensor:
    def __init__(self, data, _parents=(), _op=''):
        self.data = np.asarray(data)
        self._parents = _parents
        self._op = _op
        # gradient: same shape as data, initialized to zeros
        self.grad = np.zeros_like(self.data, dtype=np.float64)
        self._backward = lambda: None

    def __repr__(self) -> str:
        return f"Tensor(data={self.data}, grad={self.grad})"

    def __neg__(self) -> 'Tensor':
        return self * -1

    def __sub__(self, other) -> 'Tensor':
        return self + (-other)

    def __rsub__(self, other) -> 'Tensor':
        return Tensor(other) - self

    def __rmul__(self, other) -> 'Tensor':
        return self * other

    def __radd__(self, other) -> 'Tensor':
        return self + other

    def _unbroadcast(self, grad, original_shape):
        # sum over leading dimensions that broadcasting added
        while len(grad.shape) > len(original_shape):
            grad = grad.sum(axis=0)
        # sum over dims that were size 1 but got broadcast
        for i, (grad_dim, orig_dim) in enumerate(zip(grad.shape, original_shape)):
            if orig_dim == 1 and grad_dim > 1:
                grad = grad.sum(axis=i, keepdims=True)
        return grad

    def __add__(self, other) -> 'Tensor':
        # handle scalar addition
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, (self, other), '+')

        def _backward():
            # undo broadcasting by summing over broadcast dims
            grad_self = self._unbroadcast(out.grad, self.data.shape)
            grad_other = other._unbroadcast(out.grad, other.data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other
        out._backward = _backward
        return out

    def __mul__(self, other) -> 'Tensor':
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, (self, other), '*')

        def _backward():
            # local gradients, then undo broadcasting
            grad_self = out.grad * other.data
            grad_other = out.grad * self.data
            grad_self = self._unbroadcast(grad_self, self.data.shape)
            grad_other = other._unbroadcast(grad_other, other.data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other
        out._backward = _backward
        return out

    def __matmul__(self, other) -> 'Tensor':
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data @ other.data, (self, other), '@')

        def _backward():
            self_data = self.data
            other_data = other.data
            if self_data.ndim == 1 and other_data.ndim == 1:
                # vector @ vector: out is a scalar dot product
                grad_self = out.grad * other_data
                grad_other = out.grad * self_data
            else:
                # d(out)/d(self) = out.grad @ other^T (promote a 1-D other to a row)
                if other_data.ndim == 1:
                    grad_self = out.grad.reshape(-1, 1) @ other_data.reshape(1, -1)
                else:
                    grad_self = out.grad @ other_data.swapaxes(-2, -1)
                grad_self = self._unbroadcast(grad_self, self_data.shape)
                # d(out)/d(other) = self^T @ out.grad (promote a 1-D self to a column)
                if self_data.ndim == 1:
                    grad_other = self_data.reshape(-1, 1) @ out.grad.reshape(1, -1)
                else:
                    grad_other = self_data.swapaxes(-2, -1) @ out.grad
                grad_other = other._unbroadcast(grad_other, other_data.shape)
            # accumulate grads
            self.grad += grad_self
            other.grad += grad_other
        out._backward = _backward
        return out

    def __pow__(self, power) -> 'Tensor':
        assert isinstance(power, (int, float)), "only int/float powers are supported"
        out = Tensor(self.data ** power, (self, ), f'**{power}')

        def _backward():
            # d(x^p)/dx = p * x^(p-1)
            self.grad += power * (self.data ** (power - 1)) * out.grad
        out._backward = _backward
        return out

    def sum(self, axis=None, keepdims=False) -> 'Tensor':
        out = Tensor(self.data.sum(axis=axis, keepdims=keepdims), (self, ), 'sum')

        def _backward():
            # each summed element receives the full upstream gradient
            grad = out.grad
            if axis is not None and not keepdims:
                grad = np.expand_dims(grad, axis=axis)
            self.grad += np.broadcast_to(grad, self.data.shape)
        out._backward = _backward
        return out
    def mean(self, axis=None, keepdims=False) -> 'Tensor':
        # number of elements averaged over
        if axis is None:
            count = self.data.size
        else:
            count = self.data.shape[axis]
        out = Tensor(self.data.mean(axis=axis, keepdims=keepdims), (self, ), 'mean')

        def _backward():
            # mean distributes 1/count of the upstream gradient to each element
            grad = out.grad / count
            if axis is not None and not keepdims:
                grad = np.expand_dims(grad, axis=axis)
            self.grad += np.broadcast_to(grad, self.data.shape)
        out._backward = _backward
        return out

    def log(self) -> 'Tensor':
        out = Tensor(np.log(self.data), (self, ), 'log')

        def _backward():
            # d(log x)/dx = 1/x
            self.grad += (1 / self.data) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        # build topological order of the graph ending at self
        topo = []
        visited = set()

        def build_topo(tensor):
            if tensor not in visited:
                visited.add(tensor)
                for parent in tensor._parents:
                    build_topo(parent)
                topo.append(tensor)
        build_topo(self)

        # seed the gradient of the output with ones
        self.grad = np.ones_like(self.data, dtype=np.float64)
        # backprop in reverse topological order
        for node in reversed(topo):
            node._backward()

    # activation functions
    def relu(self) -> 'Tensor':
        out = Tensor(np.maximum(0, self.data), (self, ), 'ReLU')

        def _backward():
            # gradient passes through only where the input was positive
            self.grad += (self.data > 0) * out.grad
        out._backward = _backward
        return out

    def tanh(self) -> 'Tensor':
        t = np.tanh(self.data)
        out = Tensor(t, (self, ), 'tanh')

        def _backward():
            # d(tanh x)/dx = 1 - tanh(x)^2
            self.grad += (1 - t ** 2) * out.grad
        out._backward = _backward
        return out

    def sigmoid(self) -> 'Tensor':
        sig = 1 / (1 + np.exp(-self.data))
        out = Tensor(sig, (self, ), 'sigmoid')

        def _backward():
            # d(sigmoid x)/dx = sigmoid(x) * (1 - sigmoid(x))
            self.grad += sig * (1 - sig) * out.grad
        out._backward = _backward
        return out
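
# ---------------------------------------------------------------------------
# Usage sketch (an illustrative addition, not part of the class above): build a
# small graph, run backward(), and verify one gradient entry against a central
# finite-difference estimate. Shapes and the seed are arbitrary assumptions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    x = Tensor(rng.standard_normal((4, 3)))
    w = Tensor(rng.standard_normal((3, 2)))
    b = Tensor(rng.standard_normal(2))

    # forward pass: broadcasted add, matmul, tanh, pow, and mean all on the tape
    loss = ((x @ w + b).tanh() ** 2).mean()
    loss.backward()

    # numeric gradient for w[0, 0] via central differences
    eps = 1e-6

    def f(w_data):
        return float(np.mean(np.tanh(x.data @ w_data + b.data) ** 2))

    w_plus, w_minus = w.data.copy(), w.data.copy()
    w_plus[0, 0] += eps
    w_minus[0, 0] -= eps
    numeric = (f(w_plus) - f(w_minus)) / (2 * eps)
    print(f"analytic={w.grad[0, 0]:.8f}  numeric={numeric:.8f}")  # should agree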