This notebook raises a general issue that I believe we will keep running into as we move forward. The biggest problem I have found with our sample-based approach arises when samples have to be concatenated into one big array for processing steps such as `.fit` methods. This happens because we handle samples individually, even though they may originally have come from one bigger array. Let me demonstrate this with an example:

In [1]:
import numpy as np
import bob.pipelines as mario
import sklearn
In [2]:
# Toy dataset: 100 samples with 40 features each, plus random boolean labels.
# NOTE: the labels are drawn without a fixed seed, so they change on every run.
n_samples, n_features = 100, 40
X = np.arange(n_samples * n_features).reshape(n_samples, n_features)
y = np.random.random_sample(n_samples) >= 0.5
X, y
Out[2]:
(array([[   0,    1,    2, ...,   37,   38,   39],
        [  40,   41,   42, ...,   77,   78,   79],
        [  80,   81,   82, ...,  117,  118,  119],
        ...,
        [3880, 3881, 3882, ..., 3917, 3918, 3919],
        [3920, 3921, 3922, ..., 3957, 3958, 3959],
        [3960, 3961, 3962, ..., 3997, 3998, 3999]]),
 array([ True,  True, False,  True,  True, False,  True,  True, False,
        False, False,  True,  True,  True, False,  True, False,  True,
        False,  True, False,  True, False,  True,  True, False, False,
        False, False, False,  True, False, False,  True, False,  True,
         True,  True, False, False,  True, False, False, False, False,
         True, False,  True,  True, False, False,  True,  True, False,
        False,  True, False, False,  True,  True, False, False, False,
        False,  True, False,  True, False,  True,  True,  True, False,
         True, False,  True, False, False, False, False,  True, False,
         True, False, False, False,  True, False,  True, False, False,
        False,  True, False, False,  True, False, False, False,  True,
        False]))
In [3]:
# Two-step estimator: PCA down to 20 components, followed by an SVC classifier.
# The step names are spelled out because later cells look the steps up by
# name ("pca" and "svc") through `named_steps`.
pca = sklearn.decomposition.PCA(n_components=20)
svm = sklearn.svm.SVC()
pipeline = sklearn.pipeline.Pipeline(steps=[("pca", pca), ("svc", svm)])
pipeline
Out[3]:
Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=20,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)
In [4]:
# Fit the whole pipeline on the full arrays in one call, then inspect the
# support vectors learned by the SVC step.
pipeline.fit(X, y)
svm.support_vectors_
Out[4]:
array([[ 1.20166551e+04, -9.35516143e-13,  5.87253433e-12, ...,
         1.00746122e-12, -3.80809747e-13,  3.03746414e-14],
       [ 1.12577085e+04,  1.19346357e-13, -2.46621521e-13, ...,
        -4.52647333e-13, -2.00449386e-13, -1.44718585e-13],
       [ 1.04987618e+04,  1.66745243e-13, -1.30053550e-13, ...,
         1.06435309e-12, -1.94217461e-13,  4.21341284e-13],
       ...,
       [-1.04987618e+04, -1.66745243e-13,  1.30053550e-13, ...,
         1.97410737e-14,  7.19386881e-14,  6.11711358e-14],
       [-1.12577085e+04, -1.19346357e-13,  2.46621521e-13, ...,
        -2.55088254e-14,  9.42937343e-14,  1.11043942e-13],
       [-1.22696373e+04, -1.15612834e-12, -2.94314538e-13, ...,
         2.15438383e-12, -4.43591364e-13, -2.30443497e-12]])
In [5]:
# What happened when we called pipeline.fit?
# In essence, the following steps ran:
# NOTE(review): sklearn's Pipeline.fit presumably calls fit_transform on its
# intermediate steps instead of fit followed by transform; that may explain why
# the near-zero columns below differ slightly from Out[4] -- confirm.
pca.fit(X, y)
Xt = pca.transform(X)
svm.fit(Xt, y)
svm.support_vectors_
Out[5]:
array([[ 1.20166551e+04,  6.66355859e-13, -4.41785497e-13, ...,
        -2.27373675e-13, -1.13686838e-13, -3.41060513e-13],
       [ 1.12577085e+04,  4.15445456e-13, -4.43950432e-13, ...,
        -3.41060513e-13,  1.02318154e-12, -2.27373675e-13],
       [ 1.04987618e+04,  6.12176976e-13, -3.96377375e-13, ...,
         6.82121026e-13, -2.27373675e-13, -1.13686838e-13],
       ...,
       [-1.04987618e+04, -6.12176976e-13,  3.96377375e-13, ...,
        -6.82121026e-13,  2.27373675e-13,  1.13686838e-13],
       [-1.12577085e+04, -4.15445456e-13,  4.43950432e-13, ...,
         3.41060513e-13, -1.02318154e-12,  2.27373675e-13],
       [-1.22696373e+04, -1.36424205e-12,  4.54747351e-13, ...,
         4.54747351e-13,  0.00000000e+00,  9.09494702e-13]])
In [6]:
# Now let's repeat the same process, this time with our SampleWrapper and Sample classes.
# Every step of the pipeline gets wrapped with the "sample" wrapper and the same
# fit_extra_arguments (presumably (fit argument name, sample attribute name)
# pairs -- confirm against the bob.pipelines documentation).
sample_pipeline = mario.wrap(["sample"], pipeline, fit_extra_arguments=[("y", "y")])
sample_pipeline
Out[6]:
Pipeline(memory=None,
         steps=[('pca',
                 SampleWrapper(estimator=PCA(copy=True, iterated_power='auto',
                                             n_components=20, random_state=None,
                                             svd_solver='auto', tol=0.0,
                                             whiten=False),
                               fit_extra_arguments=[('y', 'y')],
                               transform_extra_arguments=())),
                ('svc',
                 SampleWrapper(estimator=SVC(C=1.0, break_ties=False,
                                             cache_size=200, class_weight=None,
                                             coef0=0.0,
                                             decision_function_shape='ovr',
                                             degree=3, gamma='scale',
                                             kernel='rbf', max_iter=-1,
                                             probability=False,
                                             random_state=None, shrinking=True,
                                             tol=0.001, verbose=False),
                               fit_extra_arguments=[('y', 'y')],
                               transform_extra_arguments=()))],
         verbose=False)
In [7]:
# Wrap every row of X in its own Sample, attaching the matching label as `y`.
samples = [
    mario.Sample(features, y=label)
    for features, label in zip(X, y)
]
# Only show the first two samples to keep the output short.
samples[:2]
Out[7]:
[Sample(data=array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39]), y=True),
 Sample(data=array([40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
        57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
        74, 75, 76, 77, 78, 79]), y=True)]
In [8]:
# Fit on the list of Sample objects. No separate label array is passed (y=None);
# the labels are expected to come from each sample's `y` attribute through the
# fit_extra_arguments configured on the wrapped steps.
sample_pipeline.fit(samples, y=None)
Out[8]:
Pipeline(memory=None,
         steps=[('pca',
                 SampleWrapper(estimator=PCA(copy=True, iterated_power='auto',
                                             n_components=20, random_state=None,
                                             svd_solver='auto', tol=0.0,
                                             whiten=False),
                               fit_extra_arguments=[('y', 'y')],
                               transform_extra_arguments=())),
                ('svc',
                 SampleWrapper(estimator=SVC(C=1.0, break_ties=False,
                                             cache_size=200, class_weight=None,
                                             coef0=0.0,
                                             decision_function_shape='ovr',
                                             degree=3, gamma='scale',
                                             kernel='rbf', max_iter=-1,
                                             probability=False,
                                             random_state=None, shrinking=True,
                                             tol=0.001, verbose=False),
                               fit_extra_arguments=[('y', 'y')],
                               transform_extra_arguments=()))],
         verbose=False)
In [17]:
# What happened when we called sample_pipeline.fit?
# The following steps ran:

def get_X_y(samples, fit_extra_arguments):
    """Stack individual samples back into one feature array and one label array.

    Parameters
    ----------
    samples
        Sample objects that expose a ``data`` attribute.
    fit_extra_arguments
        Forwarded unchanged to ``mario.wrappers._make_kwargs_from_samples``;
        the mapping it returns must contain a ``"y"`` entry.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is whatever ``mario.utils.vstack_features`` returns
        for the samples, and ``y`` is the ``"y"`` entry converted to a numpy
        array.
    """

    def as_row(sample):
        # Add a leading axis so that each sample can act as one row of the stack.
        return sample.data[None, ...]

    stacked = mario.utils.vstack_features(as_row, samples, same_size=True)
    fit_kwargs = mario.wrappers._make_kwargs_from_samples(samples, fit_extra_arguments)
    return stacked, np.asarray(fit_kwargs["y"])

# Rebuild the stacked arrays from the individual samples, using the extra fit
# arguments configured on the wrapped PCA step. This rebinds X and y.
X, y = get_X_y(samples, sample_pipeline.named_steps["pca"].fit_extra_arguments)
# Show the shapes and dtypes of the stacked features and labels.
X.shape, X.dtype, y.shape, y.dtype
Out[17]:
((100, 40), dtype('int64'), (100,), dtype('bool'))
In [18]:
# The samples were stacked into X, y so that pca.fit can run on whole arrays.
pca.fit(X, y)
Out[18]:
PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
In [19]:
# Stack X, y AGAIN, this time for .transform: the arrays built for pca.fit are
# not reused here, they are rebuilt from the samples.
X, y = get_X_y(samples, sample_pipeline.named_steps["pca"].fit_extra_arguments)
Xt = pca.transform(X)
# Show the shape and dtype of the projected features.
Xt.shape, Xt.dtype
Out[19]:
((100, 20), dtype('float64'))
In [20]:
# Wrap each projected row in a new Sample that keeps its source sample as
# parent, then stack those samples once more into Xt and y for svm.fit.
xt_samples = [
    mario.Sample(projected, parent=source)
    for projected, source in zip(Xt, samples)
]
Xt, y = get_X_y(
    xt_samples,
    sample_pipeline.named_steps["svc"].fit_extra_arguments,
)
svm.fit(Xt, y)
Out[20]:
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
In [ ]: