Students are required to hand in the exercises and questions below in the form of a SWAN notebook or html or pdf file (with code snippets) by 31 May. Please send the material to Anna Sfyrla (anna.sfyrla@cern.ch) and Stefan Gadatsch (stefan.gadatsch@cern.ch).
We will implement a linear regression with one variable to predict a target.
# Do some preliminary setup that we don't need to repeat
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
We will gemerate a dataset on the fly and examine it.
X, y, coef = datasets.make_regression(n_samples=100,
n_features=1,
n_informative=1,
n_targets=1,
bias=0.0,
effective_rank=None,
tail_strength=0.5,
noise=40.0,
shuffle=True,
coef=True,
random_state=0)
Lets convert it to pandas, embed the column names and add the target.
df = pd.DataFrame(X)
df.columns = ['X']
df.loc[:, 'y'] = y
df.head(10)
df.describe()
Let's plot it to get a better idea of what the data looks like.
fig, ax = plt.subplots(figsize=(8,8))
_ = ax.scatter(df.X, df.y, label='data')
_ = ax.set_xlabel('X')
_ = ax.set_ylabel('y')
The objective of linear regression is to minimize the cost function $$J(\theta) = \frac{1}{2m} \sum_{i=1}^m \left( h_\theta(x^{(i)} - y^{(i)})^2 \right)$$ where the hypothesis $h_\theta(x)$ is given by the linear model $$h_\theta(x) = \theta^T x = \theta_0 + \theta_1 x_1,$$ i.e. find the values of $\theta_j$.
def cost(X, y, theta):
## add here your definition of cost
A common way to minimize the cost function is batch gradient descent, i.e. performing the update $$\theta_j := \theta_j - \alpha \frac{1}{m} \sum_{i=1}^m \left( h_\theta(x^{(i)} - y^{(i)} \right) x_j^{(i)}$$ for each step of the minimization. We also add a column of ones to the training set for using vectorized computations of the cost and gradients.
# insert ones
df.insert(0, 'Ones', 1)
# get training data X and target y
cols = df.shape[1]
X = np.matrix(df.iloc[:,0:cols-1].values)
y = np.matrix(df.iloc[:,cols-1:cols].values)
theta = np.matrix(np.array([1,1]))
Compute the cost for the initial values.
cost(X, y, theta)
Complete the implementation of the minimization function below.
def fit(X, y, theta, lr=0.01, epochs=1000):
temp = np.matrix(np.zeros(theta.shape))
parameters = int(theta.ravel().shape[1])
history = np.zeros(epochs)
for i in xrange(epochs):
error = (X * theta.T) - y
for j in range(parameters):
term = np.multiply(error, X[:,j])
temp[0,j] = theta[0,j] - ((lr / len(X)) * np.sum(term))
theta = temp
history[i] = cost(X, y, theta)
return theta, history
Run gradient descent to fit the model parameters theta to the training set.
g, history = fit(X, y, theta)
print(g)
Plot the linear model along with the data and visualize the loss development.
fig, ax = plt.subplots(figsize=(8,8))
_ = ax.scatter(df.X, df.y, label='data')
_ = ax.set_xlabel('X')
_ = ax.set_ylabel('y')
x = np.linspace(df.X.min(), df.X.max(), 100)
f = g[0, 0] + (g[0, 1] * x)
ax.plot(x, f, 'r', label='Prediction')
fig, ax = plt.subplots(figsize=(8,8))
_ = ax.plot(np.arange(len(history)), history, 'r')
_ = ax.set_xlabel('Epochs')
_ = ax.set_ylabel('Cost')
Use LinearRegression
from sklearn
to perform the same task as before.
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(X, y)
Visualize the results.
x = np.array(X[:, 1].A1)
f = model.predict(X).flatten()
fig, ax = plt.subplots(figsize=(8,8))
_ = ax.plot(x, f, 'r', label='Prediction')
_ = ax.scatter(df.X, df.y, label='Traning Data')
_ = ax.set_xlabel('X')
_ = ax.set_ylabel('y')
np.random.seed(0)
X, y, coef = datasets.make_regression(n_samples=1000,
n_features=2,
n_informative=2,
n_targets=1,
bias=0.0,
effective_rank=None,
tail_strength=0.5,
noise=40.0,
shuffle=True,
coef=True,
random_state=0)
X[:,1] *= 10000. * np.random.random_sample()
df = pd.DataFrame(X)
df.columns = ['X1', 'X2']
df.loc[:, 'y'] = y
df.head(5)
df.describe()
The task of this exercise is to build classification model based on two features. Let's get a dummy dataset.
X, y = datasets.make_classification(n_samples=100, n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
df = pd.DataFrame(X)
df.columns = ['X1', 'X2']
df.loc[:, 'y'] = y
Visualize the dataset and color code the classes.
positive = df[df['y'].isin([1])]
negative = df[df['y'].isin([0])]
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(positive['X1'], positive['X2'], s=50, c='b', marker='o', label='True')
ax.scatter(negative['X1'], negative['X2'], s=50, c='r', marker='x', label='False')
ax.legend()
ax.set_xlabel('X1')
ax.set_ylabel('X2')
It looks like there is a clear decision boundary between the two classes. Now we need to implement logistic regression so we can train a model to predict the outcome. The logistic regression hypothesis is defined as $$h_\theta(x) = g(\theta^T x),$$ where $g$ is the sigmoid function defined as $$g(z) = \frac{1}{1 + e^{-z}}.$$
def sigmoid(z):
### Define sigmoid here
nums = np.arange(-10, 10, step=0.1)
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(nums, sigmoid(nums), 'r')
The cost function in logistic regression is $$J(\theta) = \frac{1}{m} \sum_{i=1}^m \left[ -y^{(i)} \log\left( h_\theta(x^{(i)})\right) - \left( 1 - y^{(i)} \right) \log\left( 1 - h_\theta(x^{(i)}) \right) \right],$$ and the gradient of the vost is a vector of the same length as $\theta$ where the $j^\text{th}$ element ($j = 0, 1, \ldots, n$) is defined as $$\frac{\partial J (\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^m \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}.$$
def cost(theta, X, y):
### define the new cost function here
Next, we convert the training data to numpy arrays.
# add a ones column - this makes the matrix multiplication work out easier
df.insert(0, 'Ones', 1)
# set X (training data) and y (target variable)
cols = df.shape[1]
X = df.iloc[:,0:cols-1]
y = df.iloc[:,cols-1:cols]
# convert to numpy arrays and initalize the parameter array theta
X = np.array(X.values)
y = np.array(y.values)
theta = np.zeros(3)
Compute the initial cost.
cost(theta, X, y)
Implement the function gradient(theta, X, y)
that computes and returns the gradients. It should not perform the gradient descent. This will later be done by a standard minimization function, similar to minuit
.
def gradient(theta, X, y):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
parameters = int(theta.ravel().shape[1])
grad = np.zeros(parameters)
error = sigmoid(X * theta.T) - y
for i in range(parameters):
term = np.multiply(error, X[:,i])
grad[i] = np.sum(term) / len(X)
return grad
Let's look at a single call to the gradient method using our data and initial paramter values of $\theta$.
gradient(theta, X, y)
Use SciPy's truncated newton (TNC) implementation to find the optimal parameters.
import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
result
Let's see what the our cost looks like with this solution.
cost(result[0], X, y)
def predict(theta, X):
### define the function here
Evaluate the accuracy of the classifier.
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y)]
accuracy = (sum(map(int, correct)) % len(correct))
print('accuracy = {0}%'.format(accuracy))
Plot the decision boundary. Assign a color to each point in the mesh [x_min, x_max]x[y_min, y_max]
.
h = 0.02 # step size in the mesh
x_min, x_max = X[:, 1].min() - .5, X[:, 1].max() + .5
y_min, y_max = X[:, 2].min() - .5, X[:, 2].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
theta_min = np.matrix(result[0])
X_test = np.c_[xx.ravel(), yy.ravel()]
one = np.ones((X_test.shape[0],1))
X_test = np.hstack((one, X_test))
Z = np.array(predict(theta_min, X_test))
positive = df[df['y'].isin([1])]
negative = df[df['y'].isin([0])]
fig, ax = plt.subplots(figsize=(8,8))
cm = plt.cm.RdBu
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
ax.scatter(positive['X1'], positive['X2'], s=50, c='b', marker='o', label='True')
ax.scatter(negative['X1'], negative['X2'], s=50, c='r', marker='x', label='False')
ax.legend()
ax.set_xlabel('X1')
ax.set_ylabel('X2')
Keep in mind that this is training set accuracy though. We didn't keep a hold-out set or use cross-validation to get a true approximation of the accuracy so this number is likely higher than its true perfomance (this topic is covered in a later exercise).
For this exercise we use a different dataset.
X, y = datasets.make_circles(n_samples=100, noise=0.2, factor=0.5, random_state=1)
df = pd.DataFrame(X)
df.columns = ['X1', 'X2']
df.loc[:, 'y'] = y
df.head(5)
Visualize it as before.
positive = df[df['y'].isin([1])]
negative = df[df['y'].isin([0])]
fig, ax = plt.subplots(figsize=(8,8))
ax.scatter(positive['X1'], positive['X2'], s=50, c='b', marker='o', label='True')
ax.scatter(negative['X1'], negative['X2'], s=50, c='r', marker='x', label='False')
ax.legend()
ax.set_xlabel('X1')
ax.set_ylabel('X2')
# add a ones column - this makes the matrix multiplication work out easier
df.insert(0, 'Ones', 1)
# set X (training data) and y (target variable)
cols = df.shape[1]
X = df.iloc[:,0:cols-1]
y = df.iloc[:,cols-1:cols]
# convert to numpy arrays and initalize the parameter array theta
X = np.array(X.values)
y = np.array(y.values)
theta = np.zeros(3)
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
result
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y)]
accuracy = (sum(map(int, correct)) % len(correct))
print('accuracy = {0}%'.format(accuracy))
h = 0.02 # step size in the mesh
x_min, x_max = X[:, 1].min() - .5, X[:, 1].max() + .5
y_min, y_max = X[:, 2].min() - .5, X[:, 2].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
theta_min = np.matrix(result[0])
X_test = np.c_[xx.ravel(), yy.ravel()]
one = np.ones((X_test.shape[0],1))
X_test = np.hstack((one, X_test))
Z = np.array(predict(theta_min, X_test))
positive = df[df['y'].isin([1])]
negative = df[df['y'].isin([0])]
fig, ax = plt.subplots(figsize=(8,8))
cm = plt.cm.RdBu
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
ax.scatter(positive['X1'], positive['X2'], s=50, c='b', marker='o', label='True')
ax.scatter(negative['X1'], negative['X2'], s=50, c='r', marker='x', label='False')
ax.legend()
ax.set_xlabel('X1')
ax.set_ylabel('X2')
To fit the data better, one possibility is creating more features derived from polynomials of the original features. Create features Fi
up to degree 5.
degree = 5
x1 = df['X1']
x2 = df['X2']
for i in range(1, degree):
for j in range(0, i):
df['F' + str(i) + str(j)] = np.power(x1, i-j) * np.power(x2, j)
df.drop('X1', axis=1, inplace=True)
df.drop('X2', axis=1, inplace=True)
df = df.iloc[:, [1, 0] + range(2, df.shape[1])]
df.head(5)
Next, include the regularisation term
$$\frac{\lambda}{2m} \sum_{j=0}^n \theta^2_j$$
in the cost function from before. Write the function cost(theta, X, y, lr)
where lr
represents the learning rate.
def cost(theta, X, y, learningRate):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
reg = (lr / 2 * len(X)) * np.sum(np.power(theta[:,1:theta.shape[1]], 2))
return np.sum(first - second) / (len(X)) + reg
Now add the regularization to the gradient function, gradientReg(theta, X, y, lr)
, i.e. implement
\begin{align*}
\frac{\partial J (\theta)}{\partial \theta_j} &= \frac{1}{m} \sum_{i=1}^m \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)} \quad &\text{for} \quad j = 0 \\
\frac{\partial J (\theta)}{\partial \theta_j} &= \left(\frac{1}{m} \sum_{i=1}^m \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}\right) + \frac{\lambda}{m} \theta_j \quad &\text{for} \quad j \ge 1 \\
\end{align*}
def gradient(theta, X, y, lr):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
parameters = int(theta.ravel().shape[1])
grad = np.zeros(parameters)
error = sigmoid(X * theta.T) - y
for i in range(parameters):
term = np.multiply(error, X[:,i])
if (i == 0):
grad[i] = np.sum(term) / len(X)
else:
grad[i] = (np.sum(term) / len(X)) + ((lr / len(X)) * theta[:,i])
return grad
# set X and y (remember from above that we moved the label to column 0)
cols = df.shape[1]
X2 = df.iloc[:,1:cols]
y2 = df.iloc[:,0:1]
# convert to numpy arrays and initalize the parameter array theta
X2 = np.array(X2.values)
y2 = np.array(y2.values)
theta2 = np.zeros(11)
Let's initialize the learning rate to a sensible value. We can play with this later if necessary (i.e. if the penalization is too strong or not strong enough).
lr = 1
Now let's try calling our new regularized functions with the default (0) values for theta to make sure the calculations are working.
cost(theta2, X2, y2, lr)
gradient(theta2, X2, y2, lr)
Now we can use the same optimization function from the previous exercise to compute the optimal solution.
result2 = opt.fmin_tnc(func=cost, x0=theta2, fprime=gradient, args=(X2, y2, lr))
result2
Check the accuracy of the solution and visualize the decision boundary.
theta_min = np.matrix(result2[0])
predictions = predict(theta_min, X2)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y2)]
accuracy = (sum(map(int, correct)) % len(correct))
print 'accuracy = {0}%'.format(accuracy)
Although we implemented these algorithms from scratch, it's worth noting that we could also use a high-level python library like scikit-learn to solve this problem.
model = linear_model.LogisticRegression(penalty='l2', C=1.0)
model.fit(X2, y2.ravel())
model.score(X2, y2)
X = np.arange(1, 11).reshape(10, 1)
y = np.array([7, 8, 7, 13, 16, 15, 19, 23, 18, 21]).reshape(10, 1)
fig, ax = plt.subplots(figsize=(8,8))
_ = ax.scatter(X, y, label='data')
_ = ax.set_xlabel('X')
_ = ax.set_ylabel('y')
Perform a linear regression using sklearn
.
model = linear_model.LinearRegression()
model.fit(X, y)
print(model.coef_, model.intercept_)
print(model.score(X, y))
Visualise the result.
a = model.coef_ * X + model.intercept_
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(X, y, 'o', X, a)
ax.set_ylim([0, 30])
ax.set_xlabel('X')
ax.set_ylabel('y')
X = np.c_[X, X**2]
X
Train the model again. Mathematically the parametrisation is $a = \theta_0 + \theta_1 x + \theta_2 x^2$.
model.fit(X, y)
x = np.arange(1, 11, 1)
x = np.c_[x, x**2]
a = np.dot(X, model.coef_.transpose()) + model.intercept_
model.score(X, y)
fig, ax = plt.subplots(figsize=(8,8))
ax.plot(X[:, 0], y, 'ro', x[:, 0], a)
ax.set_ylim([0, 30])
ax.set_xlabel('X')
ax.set_ylabel('y')
X = np.arange(1, 16)
y = np.append(y, [24, 23, 22, 26, 22])
fig, ax = plt.subplots(figsize=(8,8))
_ = ax.scatter(X, y, label='data')
_ = ax.set_xlabel('X')
_ = ax.set_ylabel('y')
E.g. here is what is happening with the linear model:
X = np.arange(1, 16).reshape(15, 1)
model.fit(X[:10], y[:10])
print(model.score(X[10:], y[10:]))
a = np.dot(X, model.coef_.transpose()) + model.intercept_
plt.plot(X, y, 'ro', X, a)
plt.show()