# others - Numpy: 如何将数据集(数组) 分割为训练和测试数据集，例如交叉验证？

``````
import numpy
x = numpy.random.rand(100, 5)
numpy.random.shuffle(x)
training, test = x[:80,:], x[80:,:]

``````

``````
import numpy
x = numpy.random.rand(100, 5)
indices = numpy.random.permutation(x.shape[0])
training_idx, test_idx = indices[:80], indices[80:]
training, test = x[training_idx,:], x[test_idx,:]

``````

``````
import numpy
x = numpy.random.rand(100, 5)
training_idx = numpy.random.randint(x.shape[0], size=80)
test_idx = numpy.random.randint(x.shape[0], size=20)
training, test = x[training_idx,:], x[test_idx,:]

``````

``````
from sklearn.cross_validation import train_test_split

data, labels = np.arange(10).reshape((5, 2)), range(5)

data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.20, random_state=42)

``````

``````
def partition(seq, chunks):
"""Splits the sequence into equal sized chunks and them as a list"""
result = []
for i in range(chunks):
chunk = []
for element in seq[i:len(seq):chunks]:
chunk.append(element)
result.append(chunk)
return result

``````

``````
import numpy as np

def get_train_test_inds(y,train_proportion=0.7):
'''Generates indices, making random stratified split into training set and testing sets
with proportions train_proportion and (1-train_proportion) of initial sample.
y is any iterable indicating classes of each observation in the sample.
Initial proportions of classes inside training and
testing sets are preserved (stratified sampling).
'''

y=np.array(y)
train_inds = np.zeros(len(y),dtype=bool)
test_inds = np.zeros(len(y),dtype=bool)
values = np.unique(y)
for value in values:
value_inds = np.nonzero(y==value)[0]
np.random.shuffle(value_inds)
n = int(train_proportion*len(value_inds))

train_inds[value_inds[:n]]=True
test_inds[value_inds[n:]]=True

return train_inds,test_inds

y = np.array([1,1,2,2,3,3])
train_inds,test_inds = get_train_test_inds(y,train_proportion=0.5)
print y[train_inds]
print y[test_inds]

``````

``````
[1 2 3]
[1 2 3]

``````