// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License. See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__
#define DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__
#include "optimization_solve_qp_using_smo_abstract.h"
#include "../matrix.h"
namespace dlib
{
// ----------------------------------------------------------------------------------------
/*
The algorithm defined in the solve_qp_using_smo() function below can be
derived by using an important theorem from the theory of constrained optimization.
This theorem tells us that any optimal point of a constrained function must
satisfy what are called the KKT conditions (also sometimes called just the KT
conditions, especially in older literature). A very good book to consult
regarding this topic is Practical Methods of Optimization (second edition) by
R. Fletcher. Below I will try to explain the general idea of how this is
applied.
Let e == ones_matrix(alpha.size(),1)
First, note that the function below solves the following quadratic program.
Minimize: f(alpha) == 0.5*trans(alpha)*Q*alpha - trans(alpha)*b
subject to the following constraints:
- trans(e)*alpha == C (i.e. the sum of alpha values doesn't change)
- min(alpha) >= 0 (i.e. all alpha values are nonnegative)
Where f is convex. This means that Q should be positive-semidefinite.
To get from this problem formulation to the algorithm below we have to
consider the KKT conditions. They tell us that any solution to the above
problem must satisfy the following 5 conditions:
1. trans(e)*alpha == C
2. min(alpha) >= 0
3. Let L(alpha, x, y) == f(alpha) - trans(x)*alpha - y*(trans(e)*alpha - C)
   Where x is a vector of length alpha.size() and y is a single scalar.
   Then the derivative of L with respect to alpha must == 0
   So we get the following as our 3rd condition:
       f'(alpha) - x - y*e == 0
4. min(x) >= 0 (i.e. all x values are nonnegative)
5. pointwise_multiply(x, alpha) == 0
   (i.e. at most one member of each x(i), alpha(i) pair can be non-zero)
From 3 we can easily obtain this rule:
    for all i: f'(alpha)(i) - x(i) == y
If we then consider 4 and 5 we can infer that the following must also hold:
    - if (alpha(i) > 0) then
        - x(i) == 0
        - f'(alpha)(i) == y
    - else
        - x(i) == some nonnegative number
        - f'(alpha)(i) >= y
The important thing to take away is the final rule. It tells us that at the
optimal solution all elements of the gradient of f whose corresponding alpha
is non-zero share the same value, y. It also tells us that all the other
gradient values are greater than or equal to y. We can use this information
to help us pick which alpha variables to optimize at each iteration.
*/
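/*
As a concrete illustration of that final rule, take the 2-variable problem
with Q equal to the 2x2 identity matrix, b = [3; 0], and C = 2. The
constrained minimizer is alpha = [2; 0], where
    f'(alpha) == Q*alpha - b == [-1; 0]
So y == -1. The gradient element belonging to the non-zero alpha equals y,
while the gradient element belonging to the zero alpha is 0, which is >= y,
just as the rule requires.
*/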
// ----------------------------------------------------------------------------------------
template <
typename EXP1,
typename EXP2,
typename T, long NR, long NC, typename MM, typename L
>
unsigned long solve_qp_using_smo (
const matrix_exp<EXP1>& Q,
const matrix_exp<EXP2>& b,
matrix<T,NR,NC,MM,L>& alpha,
T eps,
unsigned long max_iter
)
{
// make sure requires clause is not broken
DLIB_ASSERT(Q.nr() == Q.nc() &&
is_col_vector(b) &&
is_col_vector(alpha) &&
b.size() == alpha.size() &&
b.size() == Q.nr() &&
alpha.size() > 0 &&
min(alpha) >= 0 &&
eps > 0 &&
max_iter > 0,
"\t void solve_qp_using_smo()"
<< "\n\t Invalid arguments were given to this function"
<< "\n\t Q.nr(): " << Q.nr()
<< "\n\t Q.nc(): " << Q.nc()
<< "\n\t is_col_vector(b): " << is_col_vector(b)
<< "\n\t is_col_vector(alpha): " << is_col_vector(alpha)
<< "\n\t b.size(): " << b.size()
<< "\n\t alpha.size(): " << alpha.size()
<< "\n\t Q.nr(): " << Q.nr()
<< "\n\t min(alpha): " << min(alpha)
<< "\n\t eps: " << eps
<< "\n\t max_iter: " << max_iter
);
const T C = sum(alpha);
// Compute f'(alpha) (i.e. the gradient of f(alpha)) for the current alpha.
matrix<T,NR,NC,MM,L> df = Q*alpha - b;
const T tau = 1000*std::numeric_limits<T>::epsilon();
T big, little;
unsigned long iter = 0;
for (; iter < max_iter; ++iter)
{
// Find the two elements of df that satisfy the following:
// - little_idx == index_of_min(df)
// - big_idx == the index of the largest element in df such that alpha(big_idx) > 0
// These two indices will tell us which two alpha values are most in violation of the KKT
// optimality conditions.
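// (Recall from the derivation at the top of this file that at the optimum
// every df(i) with alpha(i) > 0 equals y while every other df(i) is >= y.
// So big - little measures the size of the worst current violation of that
// rule.)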
big = -std::numeric_limits<T>::max();
long big_idx = 0;
little = std::numeric_limits<T>::max();
long little_idx = 0;
for (long i = 0; i < df.nr(); ++i)
{
if (df(i) > big && alpha(i) > 0)
{
big = df(i);
big_idx = i;
}
if (df(i) < little)
{
little = df(i);
little_idx = i;
}
}
// Check whether the KKT conditions are (approximately) satisfied and stop if so.
//if (alpha(little_idx) > 0 && (big - little) < eps)
// break;
// Check how big the duality gap is and stop when it goes below eps.
// The duality gap is the gap between the current objective value and the
// value of the corresponding Wolfe dual. It is always greater than or equal
// to how far the current objective value is from the optimum, so it is a
// good way to decide if we should stop. See the book referenced above for
// more information. In particular, see the part about the Wolfe Dual.
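// Concretely, setting y = little and x = df - little*e satisfies conditions
// 3 and 4 from the comment at the top of this file, and since f is convex:
//     f(alpha) - f(alpha_optimal) <= trans(df)*(alpha - alpha_optimal)
//                                 <= trans(alpha)*df - C*little
// because alpha_optimal is elementwise nonnegative and sums to C.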
if (trans(alpha)*df - C*little < eps)
break;
// Save these values, we will need them later.
const T old_alpha_big = alpha(big_idx);
const T old_alpha_little = alpha(little_idx);
// Now optimize the two variables we just picked.
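// Moving an amount delta of weight from alpha(big_idx) to alpha(little_idx)
// keeps condition 1 (the sum constraint) satisfied and changes the objective
// by delta*(little - big) + 0.5*delta*delta*quad_coef, where quad_coef
// (computed below) is the curvature of f along that direction. Setting the
// derivative with respect to delta to zero gives the step
// delta = (big - little)/quad_coef used below.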
T quad_coef = Q(big_idx,big_idx) + Q(little_idx,little_idx) - 2*Q(big_idx, little_idx);
if (quad_coef <= tau)
quad_coef = tau;
const T delta = (big - little)/quad_coef;
alpha(big_idx) -= delta;
alpha(little_idx) += delta;
// Make sure alpha stays feasible. That is, make sure the updated alpha doesn't
// violate the non-negativity constraint.
if (alpha(big_idx) < 0)
{
// Since an alpha can't be negative we will just set it to 0 and shift all the
// weight to the other alpha.
alpha(big_idx) = 0;
alpha(little_idx) = old_alpha_big + old_alpha_little;
}
// Every 300 iterations
if ((iter%300) == 299)
{
// Perform this form of the update every so often because doing so can help
// avoid the buildup of numerical errors you get with the alternate update
// below.
df = Q*alpha - b;
}
else
{
// Now update the gradient. We will perform the equivalent of: df = Q*alpha - b;
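// Since only alpha(big_idx) and alpha(little_idx) changed, df changes by
// Q*delta_alpha where delta_alpha is zero everywhere except at those two
// positions, so we only need to add in two scaled rows of Q. (Using rows of
// Q rather than columns relies on Q being symmetric, as the QP above assumes.)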
const T delta_alpha_big = alpha(big_idx) - old_alpha_big;
const T delta_alpha_little = alpha(little_idx) - old_alpha_little;
for(long k = 0; k < df.nr(); ++k)
df(k) += Q(big_idx,k)*delta_alpha_big + Q(little_idx,k)*delta_alpha_little;
}
}
/*
using namespace std;
cout << "SMO: " << endl;
cout << " duality gap: "<< trans(alpha)*df - C*min(df) << endl;
cout << " KKT gap: "<< big-little << endl;
cout << " iter: "<< iter+1 << endl;
cout << " eps: "<< eps << endl;
*/
return iter+1;
}
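/*
A minimal usage sketch (for illustration only; the documented interface is in
optimization_solve_qp_using_smo_abstract.h). It solves the small example
problem from the comment near the top of this file:

    matrix<double> Q(2,2), b(2,1), alpha(2,1);
    Q = 1, 0,
        0, 1;
    b = 3, 0;
    alpha = 1, 1;   // nonnegative, and its sum fixes C == 2
    unsigned long iters = solve_qp_using_smo(Q, b, alpha, 1e-7, 1000);
    // alpha now holds (approximately) [2; 0], matching the example above.
*/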
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_OPTIMIZATION_SOLVE_QP_UsING_SMO_H__