# Common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Matplotlib Animation
from matplotlib.animation import FuncAnimation
from matplotlib import animation
from matplotlib import rc
rc('animation', html='jshtml')

# Interactive input plots
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

plt.style.use("ggplot")

sns.set_style('darkgrid') # darkgrid, whitegrid, dark, white and ticks
sns.set_palette('deep', 8, color_codes = True)

plt.rc('axes', titlesize=18)     # fontsize of the axes title
plt.rc('axes', labelsize=14)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=13)    # fontsize of the tick labels
plt.rc('ytick', labelsize=13)    # fontsize of the tick labels
plt.rc('legend', fontsize=13)    # legend fontsize
plt.rc('font', size=13)          # controls default text sizes

Lasso Regression

LASSO := Least absolute shrinkage and selection operator

  • Least absolute $\rightarrow$ we will be applying an $L_{1}$ penalty.
  • Shrinkage $\rightarrow$ we are shrinking the $L_{1}$ norm of $\theta$.
  • Selection $\rightarrow$ the selected features will be sparse: many $\theta_{i}$ will be exactly zero (so inference will be faster).

LASSO is popular because it leads to a sparse solution.

Benefits of Sparse $\theta$

  • Inference is faster. (The time complexity of inference in linear regression is $O(N \times M)$, where N = number of samples and M = number of features; with a sparse $\theta$ only the non-zero coefficients contribute. See the sketch after this list.)
  • Redundant features can be eliminated in the process, which inherently helps interpretability.
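As a quick illustration of the inference speed-up (a minimal sketch of my own, reusing the numpy import from the top cell): with a sparse $\theta$, prediction only needs the columns of $X$ whose coefficients are non-zero.

# Sketch: prediction cost with a dense vs. a sparse theta (illustrative only)
rng = np.random.default_rng(0)
n_samples, n_features = 1000, 500
X_demo = rng.normal(size=(n_samples, n_features))
theta_sparse = np.zeros(n_features)
theta_sparse[:5] = rng.normal(size=5)        # only 5 non-zero coefficients

# Dense prediction touches all M features: O(N x M)
y_dense = X_demo @ theta_sparse

# Sparse prediction only touches the k non-zero coefficients: O(N x k), k = 5
nz = np.flatnonzero(theta_sparse)
y_sparse = X_demo[:, nz] @ theta_sparse[nz]

assert np.allclose(y_dense, y_sparse)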

$$\theta_{opt} = \arg\min_{\theta} \; (Y - X\theta)^{T}(Y - X\theta) \quad \text{such that} \quad \lVert \theta \rVert_{1} < S$$

Using the KKT condition, we get

$$\theta_{opt} = \arg\min_{\theta} \; (Y - X\theta)^{T}(Y - X\theta) + \delta^{2} \lVert \theta \rVert_{1}$$

The above function is convex (a sum of two convex functions).
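This penalised objective translates directly into code. A minimal sketch (the helper name lasso_objective and its arguments are my own), reusing the imports above:

def lasso_objective(theta, X, Y, delta_sq):
    # Least-squares loss plus the L1 penalty:
    # (Y - X theta)^T (Y - X theta) + delta^2 * ||theta||_1
    residual = (Y - X @ theta).ravel()
    return residual @ residual + delta_sq * np.sum(np.abs(theta))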

But $\lVert \theta \rVert_{1}$ is not differentiable:

  • It is not differentiable wherever a coordinate $\theta_{i} = 0$.
  • A sub-gradient is used in such cases where the gradient cannot be computed.
  • So we can use a generalised, sub-gradient form of gradient descent here (a rough sketch follows this list).
  • Also, we will explore the coordinate descent algorithm.
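A rough sketch of sub-gradient descent on the lasso objective (my own illustration, not the method used in the animation below): the undefined derivative of $|\theta_{i}|$ at zero is replaced by a valid sub-gradient, and np.sign, which returns 0 at 0, is one such choice. The step size and iteration count are purely illustrative.

def lasso_subgradient_descent(X, Y, delta_sq, lr=0.01, n_iters=1000):
    # Sub-gradient descent on (Y - X theta)^T (Y - X theta) + delta^2 * ||theta||_1.
    # np.sign(0) = 0 is a valid sub-gradient of |.| at 0.
    theta = np.zeros((X.shape[1], 1))
    for _ in range(n_iters):
        subgrad = -2 * X.T @ (Y - X @ theta) + delta_sq * np.sign(theta)
        theta -= lr * subgrad
    return theta

With a constant step size this only hovers near the optimum rather than converging exactly, which is one more reason to look at coordinate descent later. It can be run on the design matrix X and targets Y constructed a few cells below.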

# Synthetic data: noisy samples around the line y = 4x + 7
np.random.seed(0)
N = 30
x = np.random.random(size = N)
y = 4 * x + 7 + np.random.normal(scale = 0.2, size = N)

# Plot the samples against the true line
fig, ax = plt.subplots(figsize = (10, 5), tight_layout = True)
sns.scatterplot(x = x, y = y, ax = ax, label = "Data Points", color = "b")
sns.lineplot(x = x, y = 4 * x + 7, ax = ax, label = "True Fit", color = "r")

ax.set(title = "4x + 7", xlabel = "x", ylabel = "y")
plt.show()

# Targets as a column vector and design matrix with a bias column of ones
Y = y.reshape(-1, 1)
X = np.ones((N, 2))
X[:, 1] = x

# Grid of candidate (theta_0, theta_1) values
t0_range = np.concatenate([np.linspace(-2, 7, num=20), np.linspace(7, 10, num=100)])
t1_range = np.concatenate(
    [np.linspace(-2, 4, num=100), np.linspace(4, 8, num=20), np.linspace(8, 10, num=20)]
)
T0, T1 = np.meshgrid(t0_range, t1_range)

# Evaluate the least-squares objective over the grid
Z = []
for t1 in t1_range:
    for t0 in t0_range:
        theta = np.array([t0, t1]).reshape(-1, 1)
        z = (Y - X @ theta).T @ (Y - X @ theta)
        Z += [z]
Z = np.array(Z).reshape(T0.shape)
# Grid index of the unconstrained (least-squares) optimum
optimum_theta_idx = np.unravel_index(np.argmin(Z, axis=None), Z.shape)

# Lasso-optimum coefficients for each frame (used for the regularisation path plot)
theta0_norms = []
theta1_norms = []


def plot_lasso_plot(update_idx):
    # Animation callback: one frame per value of the penalty weight mu
    for each_ax in ax:
        each_ax.cla()
    mu = mu_values[update_idx]
    # Add the L1 penalty mu * ||theta||_1 to the precomputed least-squares surface
    Z_constrained = []
    for t1_idx, t1 in enumerate(t1_range):
        for t0_idx, t0 in enumerate(t0_range):
            theta = np.array([t0, t1]).reshape(-1, 1)
            z_cons = Z[t1_idx][t0_idx] + mu * np.linalg.norm(theta.reshape(-1), ord=1)
            Z_constrained += [z_cons]

    Z_constrained = np.array(Z_constrained).reshape(T0.shape)
    # Grid index of the penalised (lasso) optimum
    optimum_theta_cons_idx = np.unravel_index(
        np.argmin(Z_constrained, axis=None), Z_constrained.shape
    )

    ax[1].contourf(T0, T1, Z, levels=50, alpha=0.7)
    ax[1].scatter(
        T0[optimum_theta_idx],
        T1[optimum_theta_idx],
        marker="*",
        s=200,
        color="y",
        label="unconstrained optimum",
    )
    ax[1].scatter(
        T0[optimum_theta_cons_idx],
        T1[optimum_theta_cons_idx],
        marker="*",
        s=200,
        color="b",
        label="lasso regression",
    )

    optimum_theta_cons = np.array(
        [T0[optimum_theta_cons_idx], T1[optimum_theta_cons_idx]]
    ).reshape(-1, 1)
    global theta0_norms, theta1_norms
    # Store this frame's coefficients; trim first so redrawn frames (or a second
    # pass when saving the animation) do not append duplicates
    del theta0_norms[update_idx:], theta1_norms[update_idx:]
    theta0_norms.append(T0[optimum_theta_cons_idx])
    theta1_norms.append(T1[optimum_theta_cons_idx])
    lasso_theta_norm = np.sum(np.abs(optimum_theta_cons.reshape(-1)))
    ax[1].fill(
        [lasso_theta_norm, 0, -lasso_theta_norm, 0],
        [0, lasso_theta_norm, 0, -lasso_theta_norm],
        "g",
        alpha=0.5,
        label=rf"$\theta \; L_{1}$ norm constraint $\leq$ {lasso_theta_norm:.4f}",
    )

    sns.scatterplot(x=x, y=y, ax=ax[0], label="Data Points", color="b")
    sns.lineplot(
        x=x,
        y=4 * x + 7,
        ax=ax[0],
        label="True Fit",
        color="r",
        linestyle="-.",
        lw = 3,
    )
    sns.lineplot(
        x=X[:, 1],
        y=(X @ optimum_theta_cons).reshape(-1),
        ax=ax[0],
        label="Lasso Fit",
        color="g",
        lw = 3,
    )
    ax[1].legend()
    ax[0].legend()
    ax[1].set(
        title="Least Square Objective function",
        xlabel=r"$\theta_{0}$",
        ylabel=r"$\theta_{1}$",
    )
    ax[0].set(
        title=rf"Line Fit $\mu$ = {mu:.3f} :: $\theta$ = {optimum_theta_cons.reshape(-1)[0]:.3f}, {optimum_theta_cons.reshape(-1)[1]:.3f}",
        xlabel=r"x",
        ylabel=r"y",
    )
    ax[1].set_xlim(-2, 10)
    ax[1].set_ylim(-2, 10)
    sns.lineplot(
        x=mu_values[: update_idx + 1],
        y=theta0_norms[:update_idx + 1],
        ax=ax[2],
        label=r"$\theta_{0}$",
        lw = 3,
    )
    sns.lineplot(
        x=mu_values[: update_idx + 1],
        y=theta1_norms[:update_idx + 1],
        ax=ax[2],
        label=r"$\theta_{1}$",
        lw = 3,
    )
    sns.lineplot(
        x=mu_values[: update_idx + 1],
        y=np.abs(np.array(theta0_norms[:update_idx + 1])) + np.abs(np.array(theta1_norms[:update_idx + 1])),
        ax=ax[2],
        label=r"$L_{1}$ norm $\theta$",
        lw = 3,
    )
    ax[2].set(
        title=r"Regularisation path of $\theta_{i}$",
        xlabel=r"$\mu$",
        ylabel="Coefficient Values",
    )
    ax[2].set_xlim(np.min(mu_values), np.max(mu_values))
    ax[2].set_ylim(0, 12)


fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(26, 8), tight_layout=True)
ax = ax.ravel()
mu_values = np.linspace(0, 100, num=16)
# One frame per mu value (mu plays the role of delta^2 in the formulas above)
anim = FuncAnimation(
    fig, func=plot_lasso_plot, frames=len(mu_values), interval=1000 / 3
)
plt.close()
anim

mywriter = animation.PillowWriter(fps=3)
anim.save('./assets/2022-02-25-machine-learning-quiz3/lasso-contour.gif',writer=mywriter)

Why Lasso gives sparsity

Geometric Interpretation:

The contour of the $L_{1}$ norm ball is sharper (its corners point out along the axes), so it is more likely to touch the contours of the least-squares objective at one of the axes, where some $\theta_{i} = 0$.

For even higher sparsity we could pick an $L_{p}$ norm with $p < 1$, as its ball is even more pointed (like a needle along the axes). But the $L_{p}$ penalty with $p < 1$ is non-convex and mathematically harder to optimise. A quick sketch of the three unit balls follows.
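A small illustrative plot of the unit balls $|\theta_{0}|^{p} + |\theta_{1}|^{p} = 1$ for $p = 2, 1, 0.5$ (my own addition, reusing the plotting imports above) makes the geometric argument visible: the smaller $p$ is, the more the ball is pinched towards the axes.

# Sketch: unit balls |t0|^p + |t1|^p = 1 for p = 2, 1 and 0.5
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), tight_layout=True)
grid = np.linspace(-1.5, 1.5, 400)
G0, G1 = np.meshgrid(grid, grid)
for each_ax, p in zip(axes, [2, 1, 0.5]):
    each_ax.contour(G0, G1, np.abs(G0) ** p + np.abs(G1) ** p, levels=[1.0], colors="g")
    each_ax.set_aspect("equal")
    each_ax.set(title=rf"$L_{{{p}}}$ unit ball", xlabel=r"$\theta_{0}$", ylabel=r"$\theta_{1}$")
plt.show()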

Gradient Descent Interpretation

For ridge regression, the gradient of the penalty is proportional to $\theta$, so $\theta$ gets close to $0$ but never exactly reaches it. In lasso regression, the (sub-)gradient of $|\theta|$ is independent of the magnitude of $\theta$ and depends only on its sign, so $\theta$ can truly reach zero, and in fewer iterations (a tiny numeric sketch follows the list below).

  • LASSO inherently does feature selection.
  • It sets the coefficients of less important features to zero.
  • This gives sparse, efficient and often more interpretable models.
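A tiny numeric sketch of that difference (illustrative values only): one gradient step on the ridge penalty shrinks a small coefficient proportionally, while the soft-threshold (proximal) step associated with the $L_{1}$ penalty can set it exactly to zero.

# Sketch: one penalty-only update for a small coefficient (illustrative values)
theta_small = 0.05
lr, delta_sq = 0.1, 1.0

ridge_step = theta_small - lr * 2 * delta_sq * theta_small                       # gradient of delta^2 * theta^2
lasso_step = np.sign(theta_small) * max(abs(theta_small) - lr * delta_sq, 0.0)   # soft-threshold step

print(ridge_step)   # ~0.04: shrunk, but still non-zero
print(lasso_step)   # 0.0: exactly zero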

Coordinate Descent

Coordinate descent is another optimisation method: in each step, we optimise along a single coordinate axis.

We can use it for LASSO regression instead of the sub-gradient generalisation of gradient descent.

Idea: it is sometimes difficult to minimise over all coordinates together (finding the gradient direction along all dimensions) in one step, so we minimise over one coordinate at a time while holding the others fixed.

Coordinate Descent for Unregularised Regression

$$ \sum \epsilon_{i}^{2} = \sum_{i} (y_{i} - (\theta_{0} \times x_{i, 0} + \theta_{1} \times x_{i, 1} + \ldots + \theta_{d} \times x_{i, d}))^{2}$$

$$ \frac{\partial ( \sum \epsilon_{i}^{2})}{\partial \theta_{j}} = -2 \times \sum_{i} x_{i, j} \times (y_{i} - (\theta_{0} \times x_{i, 0} + \theta_{1} \times x_{i, 1} + \ldots + \theta_{d} \times x_{i, d}))$$

Setting it to zero,

$$\theta_{j} = \frac{\rho_{j}}{z_{j}}$$

where

$$ z_{j} = \sum_{i} x_{i, j}^{2}$$

$$ \rho_{j} = \sum_{i} x_{i, j} \times (y_{i} - \hat{y}_{i}^{(-j)})$$

is the correlation of the residual with the $j^{th}$ column of $X$, and

$$ \hat{y}_{i}^{(-j)} = (\theta_{0} \times x_{i, 0} + \theta_{1} \times x_{i, 1} + \ldots + \theta_{d} \times x_{i, d}) - \theta_{j} \times x_{i, j}$$

is the prediction for the $i^{th}$ sample without considering the $j^{th}$ feature.
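A minimal code sketch of these updates (the function name and convergence settings are my own), cycling over the coordinates for a fixed number of sweeps:

def coordinate_descent_ols(X, Y, n_sweeps=100):
    # Unregularised least squares via coordinate descent:
    # theta_j <- rho_j / z_j, holding all other coordinates fixed.
    n, d = X.shape
    theta = np.zeros(d)
    y_flat = Y.ravel()
    z = np.sum(X ** 2, axis=0)                              # z_j = sum_i x_ij^2
    for _ in range(n_sweeps):
        for j in range(d):
            y_hat_minus_j = X @ theta - X[:, j] * theta[j]  # prediction without feature j
            rho_j = X[:, j] @ (y_flat - y_hat_minus_j)      # residual correlation with column j
            theta[j] = rho_j / z[j]
    return theta

On the X and Y built earlier, this converges to the ordinary least-squares fit, with coefficients close to (7, 4).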

Coordinate Descent for Lasso Regression

LASSO objective

$$ Obj = \sum_{i} \epsilon_{i}^{2} + \delta^{2} \sum_{j} |\theta_{j}| $$

$$ \frac{\partial Obj}{\partial \theta_{j}} = -2\rho_{j} + 2 z_{j}\theta_{j} + \delta^{2} \frac{\partial |\theta_{j}|}{\partial \theta_{j}}$$

Setting it to zero,

Case 1: If $\rho_{j} > \delta^{2} / 2$, then $\theta_{j} > 0$ and $$\theta_{j} = \frac{\rho_{j} - \delta^{2}/2}{z_{j}} $$

Case 2: If $\rho_{j} < -\delta^{2} / 2$, then $\theta_{j} < 0$ and $$\theta_{j} = \frac{\rho_{j} + \delta^{2}/2}{z_{j}} $$

Case 3: If $-\delta^{2} / 2 \leq \rho_{j} \leq \delta^{2} / 2$, then $$\theta_{j} = 0 $$

The quantity $\delta^{2} / 2$ acts as a soft threshold and also shrinks $|\theta_{j}|$.
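Putting the three cases together gives the usual soft-thresholding update. A minimal sketch (again with my own helper names), mirroring the unregularised version above:

def soft_threshold(rho_j, threshold):
    # The three cases above collapse into one soft-thresholding rule
    if rho_j > threshold:
        return rho_j - threshold
    if rho_j < -threshold:
        return rho_j + threshold
    return 0.0


def coordinate_descent_lasso(X, Y, delta_sq, n_sweeps=100):
    # Lasso via coordinate descent: theta_j <- soft_threshold(rho_j, delta^2 / 2) / z_j
    n, d = X.shape
    theta = np.zeros(d)
    y_flat = Y.ravel()
    z = np.sum(X ** 2, axis=0)
    for _ in range(n_sweeps):
        for j in range(d):
            y_hat_minus_j = X @ theta - X[:, j] * theta[j]
            rho_j = X[:, j] @ (y_flat - y_hat_minus_j)
            theta[j] = soft_threshold(rho_j, delta_sq / 2) / z[j]
    return theta

In practice the intercept coefficient $\theta_{0}$ is usually left unpenalised; this sketch penalises every coordinate for simplicity.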

Bayesian Machine Learning