ai.svm.classes

# Copyright 2023 The AI Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import numpy as np


class LinearSVC:
 21  """**Linear SVC classifier** implementation.
 22
 23  Linear SVMs use a linear decision boundary to separate the data points of
 24  different classes. When the data can be precisely linearly separated, linear
 25  SVMs are very suitable. This means that a single straight line (in 2D) or a
 26  hyperplane (in higher dimensions) can entirely divide the data points into
 27  their respective classes. A hyperplane that maximizes the margin between the
 28  classes is the decision boundary.
 29
 30  The objective of training a `LinearSVC` is to minimize the norm of the weight
 31  vector :math:`||w||`, which is the slope of the decision function.
 32
 33  .. math::
 34
 35    \\min_{{w, b}} \\dfrac{1}{2} w^{T}w = \\dfrac{1}{2} ||w||^{2} 
 36
 37
 38  .. math::
 39
 40    \\mathrm{subject\\ to}\\ t^{i} (w^{T}x^{i} + b) \\ge 1
 41
 42  .. note::
 43
 44    We are minimizing :math:`\\dfrac{1}{2}w^{T}w`, which is equal to
 45    :math:`\\dfrac{1}{2}||w||^{2}`, rather than minimizing :math:`||w||`.
 46    Indeed, :math:`\\dfrac{1}{2}||w||^{2}` has a nice and simple derivative
 47    (just :math:`w`) while :math:`||w||` is not differentiable at :math:`w = 0`.
 48    Optimization algorithms work much better on differentiable functions.
 49
 50  The cost function used by **Linear SVM classifier** is the following:
 51
 52  .. math::
 53
 54    J(w, b) = \\dfrac{1}{2}w^{T}w + C\\sum_{i=1}^{m}\\max(
 55      0,
 56      1 - t^{i}(w^{T}x^{i} + b)
 57    )
 58
 59  The loss function used is the **Hinge Loss** function that clips the value at
 60  :math:`0`:
 61
 62  .. math::
 63
 64    \\max(0, 1 - t^{i}(w^{T}x^{i} + b))
 65  """
 66
  def __init__(
    self,
    *,
    alpha: Optional[np.float16] = .01,
    lambda_p: Optional[np.float32] = .01,
    n_iters: Optional[np.int64] = 1_000
  ):
    """Initializes the model's `alpha`, `lambda_p` and number of iterations.

    Args:
      alpha: Model's learning rate. A high value might overshoot the minimum
        loss, while a low value might make the model take very long to learn.
      lambda_p: Lambda (regularization) parameter used when updating weights.
      n_iters: Maximum number of updates to make to the weights and bias in
        order to reach a prediction that minimizes the loss.
    """
    self._alpha = alpha
    self._lambda_p = lambda_p
    self._n_iters = n_iters

    self._bias = None
    self._weights = None

  def fit(self, X: np.ndarray, y: np.ndarray) -> 'LinearSVC':
    """Fit `LinearSVC` according to X, y.

    The objective of training a `LinearSVC` is to minimize the norm of the
    weight vector :math:`||w||`, which is the slope of the decision function.

    .. math::

      \\min_{{w, b}} \\dfrac{1}{2} w^{T}w = \\dfrac{1}{2} ||w||^{2}

    .. math::

      \\mathrm{subject\\ to}\\ t^{i} (w^{T}x^{i} + b) \\ge 1

    .. note::

      We are minimizing :math:`\\dfrac{1}{2}w^{T}w`, which is equal to
      :math:`\\dfrac{1}{2}||w||^{2}`, rather than minimizing :math:`||w||`.
      Indeed, :math:`\\dfrac{1}{2}||w||^{2}` has a nice and simple derivative
      (just :math:`w`) while :math:`||w||` is not differentiable at
      :math:`w = 0`. Optimization algorithms work much better on differentiable
      functions.

    The cost function used by the **Linear SVM classifier** is the following:

    .. math::

      J(w, b) = \\dfrac{1}{2}w^{T}w + C\\sum_{i=1}^{m}\\max(
        0,
        1 - t^{i}(w^{T}x^{i} + b)
      )

    The loss function used is the **Hinge Loss** function that clips the value
    at :math:`0`:

    .. math::

      \\max(0, 1 - t^{i}(w^{T}x^{i} + b))

    Args:
      X: Training vectors of shape `(n_samples, n_features)`, where
        `n_samples` is the number of samples and `n_features` is the number of
        features.
      y: Target values.

    Returns:
      The instance itself.
    """
    self._bias = 0
    self._weights = np.random.rand(X.shape[1])

    # We need the decision function to be greater than 1 for all positive
    # training instances, and lower than -1 for all negative training
    # instances. If we define ``t_i = -1`` for negative instances and
    # ``t_i = 1`` for positive instances, the margin constraint for every
    # instance can be written as ``t_i * (w.dot(x_i) - b) >= 1`` (note that
    # this implementation subtracts the bias from the decision function).
    t = np.where(y <= 0, -1, 1)
    for _ in range(self._n_iters):
      for idx, x_i in enumerate(X):
        # The hinge-loss gradient depends on whether the margin constraint is
        # satisfied, so the weights and bias are updated differently in the
        # two cases.
        if t[idx] * (np.dot(x_i, self._weights) - self._bias) >= 1:
          # Margin satisfied: only the regularization term contributes.
          self._weights -= self._alpha * (2 * self._lambda_p * self._weights)
        else:
          # Margin violated: regularization plus the hinge-loss subgradient.
          self._weights -= self._alpha * (
            2 * self._lambda_p * self._weights - np.dot(t[idx], x_i)
          )
          self._bias -= self._alpha * t[idx]
    return self

  def predict(self, X: np.ndarray) -> np.ndarray:
    """Predict for `X` using the previously calculated `weights` and `bias`.

    Args:
      X: Feature vectors with the same number of features as the data given to
        `fit`.

    Returns:
      Target vector.

    Raises:
      RuntimeError: If `predict` is called before `fit`.
      ValueError: If the number of features in the given `X` differs from the
        number of features of the `X` given to the `fit` function.
    """
    if self._weights is None or self._bias is None:
      raise RuntimeError(
        f'{self.__class__.__name__}: predict called before fitting data'
      )

    if X.shape[1] != self._weights.shape[0]:
      raise ValueError(
        f'Number of features {X.shape[1]} does not match previous data '
        f'{self._weights.shape[0]}.'
      )

    return np.sign(np.dot(X, self._weights) - self._bias)

class LinearSVC:

Linear SVC classifier implementation.

Linear SVMs use a linear decision boundary to separate the data points of different classes. They are well suited when the data is linearly separable, i.e. when a single straight line (in 2D) or a hyperplane (in higher dimensions) can cleanly divide the data points into their respective classes. The decision boundary is the hyperplane that maximizes the margin between the classes.

The objective of training a LinearSVC is to minimize the norm of the weight vector \( ||w|| \), which is the slope of the decision function.

$$\min_{{w, b}} \dfrac{1}{2} w^{T}w = \dfrac{1}{2} ||w||^{2}$$

$$\mathrm{subject\ to}\ t^{i} (w^{T}x^{i} + b) \ge 1$$

We are minimizing \( \dfrac{1}{2}w^{T}w \), which is equal to \( \dfrac{1}{2}||w||^{2} \), rather than minimizing \( ||w|| \). Indeed, \( \dfrac{1}{2}||w||^{2} \) has a nice and simple derivative (just \( w \)) while \( ||w|| \) is not differentiable at \( w = 0 \). Optimization algorithms work much better on differentiable functions.

The cost function used by the Linear SVM classifier is the following:

$$J(w, b) = \dfrac{1}{2}w^{T}w + C\sum_{i=1}^{m}\max( 0, 1 - t^{i}(w^{T}x^{i} + b) )$$

The loss function used is the Hinge Loss function that clips the value at \( 0 \):

$$\max(0, 1 - t^{i}(w^{T}x^{i} + b))$$
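
A minimal usage sketch (assuming `LinearSVC` is importable from `ai.svm.classes`; the toy data and hyperparameter values are illustrative, not tuned):

  import numpy as np

  from ai.svm.classes import LinearSVC

  # Two small, linearly separable clusters. Labels 0/1 are mapped to -1/+1
  # internally by ``fit``; ``predict`` returns the sign of the decision
  # function.
  rng = np.random.default_rng(0)
  X = np.vstack([
    rng.normal(loc=+2.0, scale=0.5, size=(20, 2)),
    rng.normal(loc=-2.0, scale=0.5, size=(20, 2)),
  ])
  y = np.concatenate([np.ones(20), np.zeros(20)])

  clf = LinearSVC(alpha=0.01, lambda_p=0.01, n_iters=1_000)
  clf.fit(X, y)

  preds = clf.predict(X)
  accuracy = np.mean(preds == np.where(y <= 0, -1, 1))
  print(f'training accuracy: {accuracy:.2f}')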

LinearSVC( *, alpha: Optional[numpy.float16] = 0.01, lambda_p: Optional[numpy.float32] = 0.01, n_iters: Optional[numpy.int64] = 1000)

Initializes the model's alpha, lambda_p and number of iterations.

Arguments:
  • alpha: Model's learning rate. A high value might overshoot the minimum loss, while a low value might make the model take very long to learn.
  • lambda_p: Lambda (regularization) parameter used when updating weights.
  • n_iters: Maximum number of updates to make to the weights and bias in order to reach a prediction that minimizes the loss.
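
Because the constructor arguments are keyword-only, they must be passed by name. A brief construction sketch (the particular values are illustrative, not tuned):

  # All three arguments have defaults, so LinearSVC() is equivalent to
  # LinearSVC(alpha=0.01, lambda_p=0.01, n_iters=1_000).
  clf = LinearSVC()

  # Custom hyperparameters must be passed as keywords; positional arguments
  # raise a TypeError because of the bare ``*`` in the signature.
  clf_slow = LinearSVC(alpha=0.001, lambda_p=0.01, n_iters=2_000)
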
def fit(self, X: numpy.ndarray, y: numpy.ndarray) -> LinearSVC:

Fit LinearSVC according to X, y.

The objective of training a LinearSVC is to minimize the norm of the weight vector \( ||w|| \), which is the slope of the decision function.

$$\min_{{w, b}} \dfrac{1}{2} w^{T}w = \dfrac{1}{2} ||w||^{2}$$

$$\mathrm{subject\ to}\ t^{i} (w^{T}x^{i} + b) \ge 1$$

We are minimizing \( \dfrac{1}{2}w^{T}w \), which is equal to \( \dfrac{1}{2}||w||^{2} \), rather than minimizing \( ||w|| \). Indeed, \( \dfrac{1}{2}||w||^{2} \) has a nice and simple derivative (just \( w \)) while \( ||w|| \) is not differentiable at \( w = 0 \). Optimization algorithms work much better on differentiable functions.

The cost function used by the Linear SVM classifier is the following:

$$J(w, b) = \dfrac{1}{2}w^{T}w + C\sum_{i=1}^{m}\max( 0, 1 - t^{i}(w^{T}x^{i} + b) )$$

The loss function used is the Hinge Loss function that clips the value at \( 0 \):

$$\max(0, 1 - t^{i}(w^{T}x^{i} + b))$$

Arguments:
  • X: Training vectors of shape (n_samples, n_features), where n_samples is the number of samples and n_features is the number of features.
  • y: Target values.
Returns:

The instance itself.
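
As a concrete reading of the cost function above, the following sketch evaluates \( J(w, b) \) with NumPy for a toy weight vector and two samples (the function name `svm_cost` and the values are illustrative; it follows the docstring's \( w^{T}x + b \) convention, whereas the implementation subtracts the bias):

  import numpy as np

  def svm_cost(w: np.ndarray, b: float, X: np.ndarray, t: np.ndarray,
               C: float = 1.0) -> float:
    """J(w, b) = 0.5 * (w @ w) + C * sum(max(0, 1 - t * (X @ w + b)))."""
    margins = t * (X @ w + b)
    hinge = np.maximum(0.0, 1.0 - margins)
    return float(0.5 * (w @ w) + C * hinge.sum())

  w = np.array([1.0, -0.5])
  b = 0.2
  X = np.array([[2.0, 1.0], [-1.5, 0.5]])
  t = np.array([1, -1])          # labels already mapped to -1/+1
  print(svm_cost(w, b, X, t))    # 0.625: both margins exceed 1, so only the
                                 # regularizer 0.5 * ||w||**2 contributes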

def predict(self, X: numpy.ndarray) -> numpy.ndarray:

Predict for X using the previously calculated weights and bias.

Arguments:
  • X: Feature vectors with the same number of features as the data given to fit.
Returns:

Target vector.

Raises:
  • RuntimeError: If predict is called before fit.
  • ValueError: If the number of features in the given X differs from the number of features of the X given to the fit function.
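
A short sketch of the behaviour described above (assuming `LinearSVC` is importable from `ai.svm.classes`; the toy data is illustrative):

  import numpy as np

  from ai.svm.classes import LinearSVC

  clf = LinearSVC()

  # Calling predict before fit raises RuntimeError.
  try:
    clf.predict(np.array([[0.0, 0.0]]))
  except RuntimeError as err:
    print(err)    # LinearSVC: predict called before fitting data

  X_train = np.array([[2.0, 2.0], [1.5, 2.5], [-2.0, -1.0], [-1.0, -2.5]])
  y_train = np.array([1, 1, 0, 0])
  clf.fit(X_train, y_train)

  # Predictions are np.sign of the decision function.
  print(clf.predict(np.array([[1.8, 2.2]])))    # typically [1.] for this toy set

  # A feature-count mismatch raises ValueError.
  try:
    clf.predict(np.array([[1.0, 2.0, 3.0]]))
  except ValueError as err:
    print(err)    # Number of features 3 does not match previous data 2.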