// Copyright (C) 2010 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.
#ifndef DLIB_LIBSVM_iO_H__
#define DLIB_LIBSVM_iO_H__
#include "libsvm_io_abstract.h"
#include <fstream>
#include <string>
#include <utility>
#include "../algs.h"
#include "../matrix.h"
#include "../string.h"
#include <vector>
namespace dlib
{
struct sample_data_io_error : public error
{
sample_data_io_error(const std::string& message): error(message) {}
};
// ----------------------------------------------------------------------------------------
namespace impl
{
    // Maps a type onto the same type with a top-level const removed.  The routines
    // in this file use it to recover a plain key type from a pair's first_type.

    // General case: nothing to strip, so the type is passed through untouched.
    template <typename U>
    struct strip_const
    {
        typedef U type;
    };

    // const U  ->  U
    template <typename U>
    struct strip_const<const U>
    {
        typedef U type;
    };

    // const U&  ->  U  (both the const and the reference are dropped)
    template <typename U>
    struct strip_const<const U&>
    {
        typedef U type;
    };
}
// Reads a LIBSVM formatted text file into a set of sparse samples and labels.
//
// Every line that is not blank and does not start with '#' is parsed as
//     <label> <key>:<value> <key>:<value> ... [# trailing comment]
// and contributes exactly one element to samples and one to labels.  Entries
// whose value is 0 are not stored in the sparse vector.
//
// file_name - path of the file to read.
// samples   - output; cleared first, then filled with one sparse vector per line.
// labels    - output; cleared first, then filled with the label of each line.
//
// Throws sample_data_io_error if the file cannot be opened, if a label cannot
// be parsed, or if a key is not followed by a ':' character.
template <typename sample_type, typename label_type, typename alloc1, typename alloc2>
void load_libsvm_formatted_data (
    const std::string& file_name,
    std::vector<sample_type, alloc1>& samples,
    std::vector<label_type, alloc2>& labels
)
{
    using namespace std;
    typedef typename sample_type::value_type pair_type;
    typedef typename impl::strip_const<typename pair_type::first_type>::type key_type;
    typedef typename pair_type::second_type value_type;

    // You must use unsigned integral key types in your sparse vectors
    COMPILE_TIME_ASSERT(is_unsigned_type<key_type>::value);

    // Both outputs are reset so they always end up with the same number of
    // elements, even when the caller passes in containers that already hold data.
    samples.clear();
    labels.clear();

    ifstream fin(file_name.c_str());
    if (!fin)
        throw sample_data_io_error("Unable to open file " + file_name);

    string line;
    istringstream sin;
    key_type key;
    value_type value;
    label_type label;
    sample_type sample;
    long line_num = 0;
    while (fin.peek() != EOF)
    {
        ++line_num;
        getline(fin, line);

        string::size_type pos = line.find_first_not_of(" \t\r\n");
        // ignore empty lines or comment lines
        if (pos == string::npos || line[pos] == '#')
            continue;

        // reuse the same string stream for every line; clear() resets the
        // error flags left over from the previous line.
        sin.clear();
        sin.str(line);
        sample.clear();

        sin >> label;
        if (!sin)
            throw sample_data_io_error("On line: " + cast_to_string(line_num) + ", error while reading file " + file_name );

        // eat whitespace
        sin >> ws;

        // read key:value pairs until the end of the line or a trailing comment
        while (sin.peek() != EOF && sin.peek() != '#')
        {
            sin >> key >> ws;

            // ignore what should be a : character
            if (sin.get() != ':')
                throw sample_data_io_error("On line: " + cast_to_string(line_num) + ", error while reading file " + file_name);

            sin >> value;

            // The stream state must be tested BEFORE skipping trailing whitespace.
            // When the value is the last token on the line, reading it sets eofbit,
            // and a following std::ws then sets failbit on standard libraries that
            // construct a sentry inside ws.  Testing after the skip would therefore
            // silently drop the last entry of such a line.
            if (sin && value != 0)
            {
                sample.insert(sample.end(), make_pair(key, value));
            }
            // NOTE: if the value itself could not be parsed, the stream is now in
            // a failed state, peek() reports EOF, and the rest of the line is
            // ignored without raising an error.

            // eat whitespace so peek() sees the next key, a '#', or EOF
            sin >> ws;
        }

        samples.push_back(sample);
        labels.push_back(label);
    }
}
// ----------------------------------------------------------------------------------------
// Overload for sparse vectors (selected when sample_type is not a dlib matrix).
//
// Writes one line per sample to file_name.  Each line starts with the label
// and is followed by a " key:value" entry for every element whose value is
// not 0.  Numbers are written with a precision of 14 digits.
//
// Requires samples.size() == labels.size() (checked with DLIB_ASSERT).
// Throws sample_data_io_error if the file cannot be opened or a write fails.
template <typename sample_type, typename label_type, typename alloc1, typename alloc2>
typename disable_if<is_matrix<sample_type>,void>::type save_libsvm_formatted_data (
    const std::string& file_name,
    const std::vector<sample_type, alloc1>& samples,
    const std::vector<label_type, alloc2>& labels
)
{
    typedef typename sample_type::value_type pair_type;
    typedef typename impl::strip_const<typename pair_type::first_type>::type key_type;
    typedef typename sample_type::const_iterator entry_iterator;

    // You must use unsigned integral key types in your sparse vectors
    COMPILE_TIME_ASSERT(is_unsigned_type<key_type>::value);

    // make sure requires clause is not broken
    DLIB_ASSERT(samples.size() == labels.size(),
        "\t void save_libsvm_formatted_data()"
        << "\n\t You have to have labels for each sample and vice versa"
        << "\n\t samples.size(): " << samples.size()
        << "\n\t labels.size(): " << labels.size()
        );

    std::ofstream fout(file_name.c_str());
    if (!fout)
        throw sample_data_io_error("Unable to open file " + file_name);
    fout.precision(14);

    for (unsigned long idx = 0; idx < samples.size(); ++idx)
    {
        const sample_type& current = samples[idx];

        // the label always comes first on the line
        fout << labels[idx];

        // then every non-zero entry as " key:value"
        const entry_iterator last = current.end();
        entry_iterator entry = current.begin();
        while (entry != last)
        {
            if (entry->second != 0)
                fout << " " << entry->first << ":" << entry->second;
            ++entry;
        }
        fout << "\n";

        // check the stream after each line so a failed write is reported
        if (!fout)
            throw sample_data_io_error("Error while writing to file " + file_name);
    }
}
// ----------------------------------------------------------------------------------------
// Overload for dense vectors (selected when sample_type is a dlib matrix).
//
// Writes one line per sample to file_name.  Each line starts with the label
// and is followed by a " index:value" entry for every element whose value is
// not 0, where index is the element's zero-based position in the vector.
// Numbers are written with a precision of 14 digits.
//
// Requires samples.size() == labels.size() (checked with DLIB_ASSERT).
// Throws sample_data_io_error if the file cannot be opened or a write fails.
template <typename sample_type, typename label_type, typename alloc1, typename alloc2>
typename enable_if<is_matrix<sample_type>,void>::type save_libsvm_formatted_data (
    const std::string& file_name,
    const std::vector<sample_type, alloc1>& samples,
    const std::vector<label_type, alloc2>& labels
)
{
    // make sure requires clause is not broken
    DLIB_ASSERT(samples.size() == labels.size(),
        "\t void save_libsvm_formatted_data()"
        << "\n\t You have to have labels for each sample and vice versa"
        << "\n\t samples.size(): " << samples.size()
        << "\n\t labels.size(): " << labels.size()
        );

    std::ofstream fout(file_name.c_str());
    if (!fout)
        throw sample_data_io_error("Unable to open file " + file_name);
    fout.precision(14);

    for (unsigned long idx = 0; idx < samples.size(); ++idx)
    {
        const sample_type& current = samples[idx];

        // the label always comes first on the line
        fout << labels[idx];

        // then every non-zero element as " index:value"
        for (long pos = 0; pos < current.size(); ++pos)
        {
            if (current(pos) != 0)
                fout << " " << pos << ":" << current(pos);
        }
        fout << "\n";

        // check the stream after each line so a failed write is reported
        if (!fout)
            throw sample_data_io_error("Error while writing to file " + file_name);
    }
}
// ----------------------------------------------------------------------------------------
// Converts a set of sparse vectors into dense column vectors.
//
// samples - the sparse vectors; each is a container of (key, value) pairs with
//           an unsigned integral key type.
//
// Returns one dense vector per input sample.  All returned vectors have the
// same length, which is one more than the largest key found in any sample (or
// 0 if no sample has any entries).  Elements not mentioned by a sample are 0.
// An empty input yields an empty result.
template <typename sample_type, typename alloc>
std::vector<matrix<typename sample_type::value_type::second_type,0,1> > sparse_to_dense (
    const std::vector<sample_type, alloc>& samples
)
{
    typedef typename sample_type::value_type pair_type;
    typedef typename impl::strip_const<typename pair_type::first_type>::type key_type;
    typedef typename pair_type::second_type value_type;
    typedef typename sample_type::const_iterator entry_iterator;

    // You must use unsigned integral key types in your sparse vectors
    COMPILE_TIME_ASSERT(is_unsigned_type<key_type>::value);

    std::vector< matrix<value_type,0,1> > result;

    // do nothing if there aren't any samples
    if (samples.size() == 0)
        return result;

    // figure out how many elements we need in our dense vectors.  Every key is
    // inspected rather than only the last entry of each sample, so the size is
    // large enough even if a sample's entries are not ordered by key, and no
    // temporary end() iterator has to be decremented.
    unsigned long max_dim = 0;
    for (unsigned long i = 0; i < samples.size(); ++i)
    {
        for (entry_iterator j = samples[i].begin(); j != samples[i].end(); ++j)
        {
            max_dim = std::max<unsigned long>(max_dim, j->first + 1);
        }
    }

    // now turn all the samples into dense samples
    result.resize(samples.size());
    for (unsigned long i = 0; i < samples.size(); ++i)
    {
        // zero-fill first so elements without a sparse entry read as 0
        result[i].set_size(max_dim);
        result[i] = 0;
        for (entry_iterator j = samples[i].begin(); j != samples[i].end(); ++j)
        {
            result[i](j->first) = j->second;
        }
    }

    return result;
}
// ----------------------------------------------------------------------------------------
}
#endif // DLIB_LIBSVM_iO_H__