Commit e1498574 authored by André Anjos's avatar André Anjos 💬
Browse files

Implemented most of scatter functionality

parent 22f0ebef
......@@ -46,6 +46,7 @@ setup(
"xbob/math/linsolve.cpp",
"xbob/math/pavx.cpp",
"xbob/math/norminv.cpp",
"xbob/math/scatter.cpp",
"xbob/math/main.cpp",
],
packages = packages,
......
......@@ -17,6 +17,7 @@
#include "linsolve.h"
#include "pavx.h"
#include "norminv.h"
#include "scatter.h"
PyDoc_STRVAR(s_histogram_intersection_str, "histogram_intersection");
PyDoc_STRVAR(s_histogram_intersection_doc,
......@@ -263,6 +264,129 @@ It is equivalent as calling :py:func:`norminv(p, 0, 1)`. The value\n\
Reference: `<http://home.online.no/~pjacklam/notes/invnorm/>`_\n\
");
PyDoc_STRVAR(s_scatter_str, "scatter");
PyDoc_STRVAR(s_scatter_doc,
"scatter(a) -> (array, array)\n\
scatter(a, s) -> array\n\
scatter(a, s, m) -> None\n\
\n\
Computes the scatter matrix of a 2D array *considering data is organized\n\
row-wise* (each sample is a row, each feature is a column). The\n\
resulting array ``s`` is squared with extents equal to the\n\
number of columns in ``a``. The resulting array ``m`` is a 1D array\n\
with the row means of ``a``. This method supports only 32 or 64-bit\n\
float arrays as input.\n\
\n\
This function supports many calling modes, but you should provide, at\n\
least, the input data matrix ``a``. All non-provided arguments will be\n\
allocated internally and returned.\n\
");
PyDoc_STRVAR(s_scatter_nocheck_str, "scatter_");
PyDoc_STRVAR(s_scatter_nocheck_doc,
"scatter_(a, s, m) -> None\n\
\n\
Computes the scatter matrix of a 2D array *considering data is organized\n\
row-wise* (each sample is a row, each feature is a column). The\n\
resulting array ``s`` is squared with extents equal to the\n\
number of columns in ``a``. The resulting array ``m`` is a 1D array\n\
with the row means of ``a``. This method supports only 32 or 64-bit\n\
float arrays as input.\n\
\n\
.. warning::\n\
\n\
THIS VARIANT DOES NOT PERFORM ANY CHECKS ON THE INPUT MATRICES AND IS,\n\
FASTER THEN THE VARIANT NOT ENDING IN ``_``. Use it when you are sure\n\
your input matrices sizes match.\n\
");
PyDoc_STRVAR(s_scatters_str, "scatters");
PyDoc_STRVAR(s_scatters_doc,
"scatters(data) -> (array, array, array)\n\
scatters(data, sw, sb) -> array\n\
scatters(data, sw, sb, m) -> None\n\
\n\
Computes the within-class (``sw``) and between-class (``sb``) scatter\n\
matrices of a set of 2D arrays considering data is organized row-wise\n\
(each sample is a row, each feature is a column).\n\
\n\
This function supports many calling modes, but you should provide, at\n\
least, the input data matrices ``data``, which should be sequence of\n\
2D 32-bit or 64-bit float values in which every row of each matrix\n\
represents one observation for a given class. **Every matrix in\n\
``data`` should have exactly the same number of columns.**\n\
\n\
The returned values ``sw`` and ``sb`` are square matrices with the\n\
same number of rows and columns as the number of columns in ``data``.\n\
The returned value ``m`` (last call variant) is a 1D array with the\n\
same length as number of columns in each ``data`` matrix and represents\n\
the ensemble mean with no prior (i.e., biased towards classes with more\n\
samples.\n\
\n\
Strategy implemented:\n\
\n\
1. Evaluate the overall mean (``m``), class means (:math:`m_k`) and the\n\
total class counts (:math:`N`).\n\
2. Evaluate ``sw`` and ``sb`` using normal loops.\n\
\n\
Note that ``sw`` and ``sb``, in this implementation, will be normalized\n\
by N-1 (number of samples) and K (number of classes). This procedure\n\
makes the eigen values scaled by (N-1)/K, effectively increasing their\n\
values. The main motivation for this normalization are numerical\n\
precision concerns with the increasing number of samples causing a\n\
rather large Sw matrix. A normalization strategy mitigates this\n\
problem. The eigen vectors will see no effect on this normalization as\n\
they are normalized in the euclidean sense (:math:`||a|| = 1`) so that\n\
does not change those.\n\
");
PyDoc_STRVAR(s_scatters_nocheck_str, "scatters_");
PyDoc_STRVAR(s_scatters_nocheck_doc,
"scatters_(data, sw, sb) -> None\n\
scatters_(data, sw, sb, m) -> None\n\
\n\
Computes the within-class (``sw``) and between-class (``sb``) scatter\n\
matrices of a set of 2D arrays considering data is organized row-wise\n\
(each sample is a row, each feature is a column).\n\
\n\
.. warning::\n\
\n\
THIS VARIANT DOES NOT PERFORM ANY CHECKS ON THE INPUT MATRICES AND IS,\n\
FASTER THEN THE VARIANT NOT ENDING IN ``_``. Use it when you are sure\n\
your input matrices sizes match.\n\
\n\
This function supports many calling modes, but you should provide, at\n\
least, the input data matrices ``data``, which should be sequence of\n\
2D 32-bit or 64-bit float values in which every row of each matrix\n\
represents one observation for a given class. **Every matrix in\n\
``data`` should have exactly the same number of columns.**\n\
\n\
The returned values ``sw`` and ``sb`` are square matrices with the\n\
same number of rows and columns as the number of columns in ``data``.\n\
The returned value ``m`` (last call variant) is a 1D array with the\n\
same length as number of columns in each ``data`` matrix and represents\n\
the ensemble mean with no prior (i.e., biased towards classes with more\n\
samples. **In this variant, you should pre-allocate all output matrices\n\
so that scatters (and the overall mean) are stored on your provided\n\
arrays**.\n\
\n\
Strategy implemented:\n\
\n\
1. Evaluate the overall mean (``m``), class means (:math:`m_k`) and the\n\
total class counts (:math:`N`).\n\
2. Evaluate ``sw`` and ``sb`` using normal loops.\n\
\n\
Note that ``sw`` and ``sb``, in this implementation, will be normalized\n\
by N-1 (number of samples) and K (number of classes). This procedure\n\
makes the eigen values scaled by (N-1)/K, effectively increasing their\n\
values. The main motivation for this normalization are numerical\n\
precision concerns with the increasing number of samples causing a\n\
rather large Sw matrix. A normalization strategy mitigates this\n\
problem. The eigen vectors will see no effect on this normalization as\n\
they are normalized in the euclidean sense (:math:`||a|| = 1`) so that\n\
does not change those.\n\
");
static PyMethodDef module_methods[] = {
{
s_histogram_intersection_str,
......@@ -354,6 +478,30 @@ static PyMethodDef module_methods[] = {
METH_VARARGS|METH_KEYWORDS,
s_normsinv_doc
},
{
s_scatter_str,
(PyCFunction)py_scatter,
METH_VARARGS|METH_KEYWORDS,
s_scatter_doc
},
{
s_scatter_nocheck_str,
(PyCFunction)py_scatter_nocheck,
METH_VARARGS|METH_KEYWORDS,
s_scatter_nocheck_doc
},
{
s_scatters_str,
(PyCFunction)py_scatters,
METH_VARARGS|METH_KEYWORDS,
s_scatters_doc
},
{
s_scatters_nocheck_str,
(PyCFunction)py_scatters_nocheck,
METH_VARARGS|METH_KEYWORDS,
s_scatters_nocheck_doc
},
{0} /* Sentinel */
};
......
/**
* @date Mon Jun 20 11:47:58 2011 +0200
* @author Andre Anjos <andre.anjos@idiap.ch>
*
* @brief Python bindings to statistical methods
*
* Copyright (C) 2011-2013 Idiap Research Institute, Martigny, Switzerland
*/
#include "scatter.h"
#include <xbob.blitz/cppapi.h>
#include <bob/math/stats.h>
PyObject* py_scatter (PyObject*, PyObject* args, PyObject* kwds) {
/* Parses input arguments in a single shot */
static const char* const_kwlist[] = { "a", "s", "m", 0 /* Sentinel */ };
static char** kwlist = const_cast<char**>(const_kwlist);
PyBlitzArrayObject* a = 0;
PyBlitzArrayObject* s = 0;
PyBlitzArrayObject* m = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&",
kwlist,
&PyBlitzArray_Converter, &a,
&PyBlitzArray_OutputConverter, &s,
&PyBlitzArray_OutputConverter, &m
)) return 0;
// basic checks
if (a->ndim != 2 || (a->type_num != NPY_FLOAT32 && a->type_num != NPY_FLOAT64)) {
PyErr_SetString(PyExc_TypeError, "input data matrix `a' should be either a 32 or 64-bit float 2D array");
Py_DECREF(a);
Py_XDECREF(s);
Py_XDECREF(m);
return 0;
}
if (s && (s->ndim != 2 || (s->type_num != a->type_num))) {
PyErr_SetString(PyExc_TypeError, "output data matrix `s' should be either a 32 or 64-bit float 2D array, matching the data type of `a'");
Py_DECREF(a);
Py_DECREF(s);
Py_XDECREF(m);
}
if (m && (m->ndim != 1 || (m->type_num != a->type_num))) {
PyErr_SetString(PyExc_TypeError, "output data vector `m' should be either a 32 or 64-bit float 1D array, matching the data type of `a'");
Py_DECREF(a);
Py_XDECREF(s);
Py_DECREF(m);
}
// allocates data not passed by the user
bool user_s = s;
if (!s) {
Py_ssize_t sshape[2] = {a->shape[1], a->shape[1]};
s = (PyBlitzArrayObject*)PyBlitzArray_SimpleNew(a->type_num, 2, sshape);
}
bool user_m = m;
if (!m) m = (PyBlitzArrayObject*)PyBlitzArray_SimpleNew(a->type_num, 1, &a->shape[1]);
try {
switch (a->type_num) {
case NPY_FLOAT32:
bob::math::scatter(
*PyBlitzArrayCxx_AsBlitz<float,2>(a),
*PyBlitzArrayCxx_AsBlitz<float,2>(s),
*PyBlitzArrayCxx_AsBlitz<float,1>(m)
);
break;
case NPY_FLOAT64:
bob::math::scatter(
*PyBlitzArrayCxx_AsBlitz<double,2>(a),
*PyBlitzArrayCxx_AsBlitz<double,2>(s),
*PyBlitzArrayCxx_AsBlitz<double,1>(m)
);
break;
default:
PyErr_Format(PyExc_TypeError, "scatter calculation currently not implemented for type '%s'", PyBlitzArray_TypenumAsString(a->type_num));
}
}
catch (std::exception& e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
}
catch (...) {
PyErr_SetString(PyExc_RuntimeError, "scatter calculation failed: unknown exception caught");
}
Py_DECREF(a);
if (PyErr_Occurred()) {
Py_DECREF(s);
Py_DECREF(m);
return 0;
}
int returns = 2 - (user_s + user_m);
if (!returns) {
Py_DECREF(s);
Py_DECREF(m);
Py_RETURN_NONE;
}
// if you get here, it means the user is interested on stuff to be returned
PyObject* retval = PyTuple_New(returns);
if (!retval) {
Py_DECREF(s);
Py_DECREF(m);
}
// fill from the back
if (!user_m) PyTuple_SET_ITEM(--retval, returns, (PyObject*)m);
else Py_DECREF(m);
if (!user_s) PyTuple_SET_ITEM(--retval, returns, (PyObject*)s);
else Py_DECREF(s);
return retval;
}
PyObject* py_scatter_nocheck (PyObject*, PyObject* args, PyObject* kwds) {
/* Parses input arguments in a single shot */
static const char* const_kwlist[] = { "a", "s", "m", 0 /* Sentinel */ };
static char** kwlist = const_cast<char**>(const_kwlist);
PyBlitzArrayObject* a = 0;
PyBlitzArrayObject* s = 0;
PyBlitzArrayObject* m = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&O&O&",
kwlist,
&PyBlitzArray_Converter, &a,
&PyBlitzArray_OutputConverter, &s,
&PyBlitzArray_OutputConverter, &m
)) return 0;
// basic checks
if (a->ndim != 2 || (a->type_num != NPY_FLOAT32 && a->type_num != NPY_FLOAT64)) {
PyErr_SetString(PyExc_TypeError, "input data matrix `a' should be either a 32 or 64-bit float 2D array");
Py_DECREF(a);
Py_DECREF(s);
Py_DECREF(m);
return 0;
}
if (s->ndim != 2 || (s->type_num != a->type_num)) {
PyErr_SetString(PyExc_TypeError, "output data matrix `s' should be either a 32 or 64-bit float 2D array, matching the data type of `a'");
Py_DECREF(a);
Py_DECREF(s);
Py_DECREF(m);
}
if (m->ndim != 1 || (m->type_num != a->type_num)) {
PyErr_SetString(PyExc_TypeError, "output data vector `m' should be either a 32 or 64-bit float 1D array, matching the data type of `a'");
Py_DECREF(a);
Py_DECREF(s);
Py_DECREF(m);
}
try {
switch (a->type_num) {
case NPY_FLOAT32:
bob::math::scatter(
*PyBlitzArrayCxx_AsBlitz<float,2>(a),
*PyBlitzArrayCxx_AsBlitz<float,2>(s),
*PyBlitzArrayCxx_AsBlitz<float,1>(m)
);
break;
case NPY_FLOAT64:
bob::math::scatter(
*PyBlitzArrayCxx_AsBlitz<double,2>(a),
*PyBlitzArrayCxx_AsBlitz<double,2>(s),
*PyBlitzArrayCxx_AsBlitz<double,1>(m)
);
break;
default:
PyErr_Format(PyExc_TypeError, "(no-check) scatter calculation currently not implemented for type '%s'", PyBlitzArray_TypenumAsString(a->type_num));
}
}
catch (std::exception& e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
}
catch (...) {
PyErr_SetString(PyExc_RuntimeError, "(no-check) scatter calculation failed: unknown exception caught");
}
Py_DECREF(a);
Py_DECREF(s);
Py_DECREF(m);
if (PyErr_Occurred()) return 0;
Py_RETURN_NONE;
}
/**
* Converts the input iterable d into a tuple of PyBlitzArrayObject's. Checks
* each array is 2D and of type NPY_FLOAT32 or NPY_FLOAT64, consistently.
* Returns 0 if a problem occurs, 1 otherwise.
*/
int BzTuple_Converter(PyObject* o, PyObject** a) {
PyObject* tmp = PySequence_Tuple(o);
if (!tmp) return 0;
if (PyTuple_GET_SIZE(o) < 2) {
PyErr_SetString(PyExc_TypeError, "input data object must be a sequence or iterable with at least 2 2D arrays with 32 or 64-bit floats");
Py_DECREF(tmp);
return 0;
}
PyBlitzArrayObject* first = 0;
int status = PyBlitzArray_Converter(PyTuple_GET_ITEM(tmp, 0), &first);
if (!status) {
Py_DECREF(tmp);
return 0;
}
if (first->ndim != 2 ||
(first->type_num != NPY_FLOAT32 && first->type_num != NPY_FLOAT64)) {
PyErr_SetString(PyExc_TypeError, "input data object must be a sequence or iterable with at least 2 2D arrays with 32 or 64-bit floats - the first array does not conform");
Py_DECREF(first);
Py_DECREF(tmp);
}
PyObject* retval = PyTuple_New(PyTuple_GET_SIZE(tmp));
if (!retval) {
Py_DECREF(tmp);
Py_DECREF(first);
return 0;
}
PyTuple_SET_ITEM(retval, 0, (PyObject*)first);
for (Py_ssize_t i=1; i<PyTuple_GET_SIZE(tmp); ++i) {
PyBlitzArrayObject* next = 0;
PyObject* item = PyTuple_GET_ITEM(tmp, i); //borrowed
int status = PyBlitzArray_Converter(item, &next);
if (!status) {
Py_DECREF(retval);
Py_DECREF(tmp);
return 0;
}
if (next->type_num != first->type_num) {
PyErr_Format(PyExc_TypeError, "array at data[%" PY_FORMAT_SIZE_T "d] does not have the same data type as the first array on the sequence (%s != %s)", i, PyBlitzArray_TypenumAsString(next->type_num), PyBlitzArray_TypenumAsString(first->type_num));
Py_DECREF(next);
Py_DECREF(retval);
Py_DECREF(tmp);
return 0;
}
if (next->ndim != 2) {
PyErr_Format(PyExc_TypeError, "array at data[%" PY_FORMAT_SIZE_T "d] does not have two dimensions, but %" PY_FORMAT_SIZE_T "d", i, next->ndim);
Py_DECREF(next);
Py_DECREF(retval);
Py_DECREF(tmp);
return 0;
}
PyTuple_SET_ITEM(retval, i, (PyObject*)next); //steals `next'
}
Py_DECREF(tmp);
*a = retval;
return 1;
}
PyObject* py_scatters (PyObject*, PyObject* args, PyObject* kwds) {
/* Parses input arguments in a single shot */
static const char* const_kwlist[] = { "data", "sw", "sb", "m", 0 /* Sentinel */ };
static char** kwlist = const_cast<char**>(const_kwlist);
PyObject* data = 0;
PyBlitzArrayObject* sw = 0;
PyBlitzArrayObject* sb = 0;
PyBlitzArrayObject* m = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|O&O&O&", kwlist,
&BzTuple_Converter, &data,
&PyBlitzArray_OutputConverter, &sw,
&PyBlitzArray_OutputConverter, &sb,
&PyBlitzArray_OutputConverter, &m
)) return 0;
PyBlitzArrayObject* first = (PyBlitzArrayObject*)PyTuple_GET_ITEM(data, 0);
if (sw && (sw->ndim != 2 || (sw->type_num != first->type_num))) {
PyErr_SetString(PyExc_TypeError, "output data matrix `sw' should be either a 32 or 64-bit float 2D array, matching the data type of `data'");
Py_DECREF(data);
Py_DECREF(sw);
Py_XDECREF(sb);
Py_XDECREF(m);
}
if (sb && (sb->ndim != 2 || (sb->type_num != first->type_num))) {
PyErr_SetString(PyExc_TypeError, "output data matrix `sb' should be either a 32 or 64-bit float 2D array, matching the data type of `data'");
Py_DECREF(data);
Py_XDECREF(sw);
Py_DECREF(sb);
Py_XDECREF(m);
}
if (m && (m->ndim != 1 || (m->type_num != first->type_num))) {
PyErr_SetString(PyExc_TypeError, "output data vector `m' should be either a 32 or 64-bit float 1D array, matching the data type of `data'");
Py_DECREF(data);
Py_XDECREF(sw);
Py_XDECREF(sb);
Py_DECREF(m);
}
// allocates data not passed by the user
bool user_sw = sw;
if (!sw) {
Py_ssize_t sshape[2] = {first->shape[1], first->shape[1]};
sw = (PyBlitzArrayObject*)PyBlitzArray_SimpleNew(first->type_num, 2, sshape);
}
bool user_sb = sb;
if (!sb) {
Py_ssize_t sshape[2] = {first->shape[1], first->shape[1]};
sb = (PyBlitzArrayObject*)PyBlitzArray_SimpleNew(first->type_num, 2, sshape);
}
bool user_m = m;
if (!m) m = (PyBlitzArrayObject*)PyBlitzArray_SimpleNew(first->type_num, 1, &first->shape[1]);
try {
switch (first->type_num) {
case NPY_FLOAT32:
{
std::vector<blitz::Array<float,2>> cxxdata;
for (Py_ssize_t i=1; i<PyTuple_GET_SIZE(data); ++i) {
cxxdata.push_back(*PyBlitzArrayCxx_AsBlitz<float,2>
((PyBlitzArrayObject*)PyTuple_GET_ITEM(data,i)));
bob::math::scatters(cxxdata,
*PyBlitzArrayCxx_AsBlitz<float,2>(sw),
*PyBlitzArrayCxx_AsBlitz<float,2>(sb),
*PyBlitzArrayCxx_AsBlitz<float,1>(m)
);
}
}
break;
case NPY_FLOAT64:
{
std::vector<blitz::Array<double,2>> cxxdata;
for (Py_ssize_t i=1; i<PyTuple_GET_SIZE(data); ++i) {
cxxdata.push_back(*PyBlitzArrayCxx_AsBlitz<double,2>
((PyBlitzArrayObject*)PyTuple_GET_ITEM(data,i)));
bob::math::scatters(cxxdata,
*PyBlitzArrayCxx_AsBlitz<double,2>(sw),
*PyBlitzArrayCxx_AsBlitz<double,2>(sb),
*PyBlitzArrayCxx_AsBlitz<double,1>(m)
);
}
}
break;
default:
PyErr_Format(PyExc_TypeError, "scatters calculation currently not implemented for type '%s'", PyBlitzArray_TypenumAsString(first->type_num));
}
}
catch (std::exception& e) {
PyErr_SetString(PyExc_RuntimeError, e.what());
}
catch (...) {
PyErr_SetString(PyExc_RuntimeError, "scatters calculation failed: unknown exception caught");
}
Py_DECREF(data);
if (PyErr_Occurred()) {
Py_DECREF(sw);
Py_DECREF(sb);
Py_DECREF(m);
return 0;
}
int returns = 3 - (user_sw + user_sb + user_m);
if (!returns) {
Py_DECREF(sw);
Py_DECREF(sb);
Py_DECREF(m);
Py_RETURN_NONE;
}
// if you get here, it means the user is interested on stuff to be returned
PyObject* retval = PyTuple_New(returns);
if (!retval) {
Py_DECREF(sw);
Py_DECREF(sb);
Py_DECREF(m);
}
// fill from the back
if (!user_m) PyTuple_SET_ITEM(--retval, returns, (PyObject*)m);
else Py_DECREF(m);
if (!user_sb) PyTuple_SET_ITEM(--retval, returns, (PyObject*)sb);
else Py_DECREF(sb);
if (!user_sw) PyTuple_SET_ITEM(--retval, returns, (PyObject*)sw);
else Py_DECREF(sw);
return retval;
}
PyObject* py_scatters_nocheck (PyObject*, PyObject* args, PyObject* kwds) {
/* Parses input arguments in a single shot */
static const char* const_kwlist[] = { "data", "sw", "sb", "m", 0 /* Sentinel */ };
static char** kwlist = const_cast<char**>(const_kwlist);
PyObject* data = 0;
PyBlitzArrayObject* sw = 0;
PyBlitzArrayObject* sb = 0;
PyBlitzArrayObject* m = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&O&O&O&", kwlist,
&BzTuple_Converter, &data,
&PyBlitzArray_OutputConverter, &sw,
&PyBlitzArray_OutputConverter, &sb,
&PyBlitzArray_OutputConverter, &m
)) return 0;
PyBlitzArrayObject* first = (PyBlitzArrayObject*)PyTuple_GET_ITEM(data, 0);
if (sw->ndim != 2 || (sw->type_num != first->type_num)) {
PyErr_SetString(PyExc_TypeError, "output data matrix `sw' should be either a 32 or 64-bit float 2D array, matching the data type of `data'");
Py_DECREF(data);