入门NuPIC

Encoders

ScalarEncoder

1
2
3
4
5
6
7
8
9
10
11
12
from nupic.encoders import ScalarEncoder
se = ScalarEncoder(n=22, w=3, minval=0, maxval=10, forced=True, clipInput=True)
# 等价于:
# se = ScalarEncoder(w=3, minval=0, maxval=10, radius=1.5, forced=True, clipInput=True)

print se.encode(3) #[0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
print se.encode(4) #[0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
print se.encode(5) #[0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0]
print se.encode(6) #[0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0]
print se.encode(7) #[0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
print se.encode(100) #[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1]
print se.encode(1000) #[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1]

ScalarEncoder 很好的模拟了像耳蜗这样的生物器官,数据编码是有范围的也就是说大于或小于这个范围的数据是不可用的。
当我们创建这种编码器时,我们首先将值的范围划分为buckets,然后映射活动的cell的位置。一般的算法步骤是:

  1. 计算数据的范围range = maxValue - minValue
  2. 选择一些你要拆分值的buckets数;它的值依赖于你的应用中固有噪声和期望的预测质量来选择大小。
  3. 选择每次表示活动位的数量w
  4. 计算总共需要的位数n = buckets + w - 1
  5. 给定一个值,确定起始索引位i = floor[buckets*(v-minValue)/range]

至于上面两个 ScalarEncoder 参数不同但结果为什么相同,是基于下面的算法公式得来的:

  1. range = maxval - minval
  2. h = (w-1)/2 (half-width)
  3. resolution = radius / w
  4. n = w * range/radius (periodic)或n = w * range/radius + 2 * h (non-periodic)
    因此已知 n, w, range 可以求出 radius = 1.5,resolution = 0.5

参数介绍

  • n:编码输出的比特数,必须大于或等于w;
  • w:编码中为1的比特数,它必须为奇数;
  • minval:编码的数据可识别最小值;
  • maxval:编码的数据可识别最大值;
  • forced:如果为true,将会跳过一些安全检测,默认为false;
  • clipInput:如果设置为true,在最大值和最小值范围外编码取最大或最小值的编码;如果为false,则会报错;默认为false;
  • resolution:编码器能编码输入数据的刻度,也就是输入数据的分辨率。例如设置为1,则表示数值2和2.2编码出来将是一样的表征,而输入数值差为1时才编码出来的不一样;
  • radius:当两个输入的差值小于radius时,编码出来的位总会有一些位重叠;当两个输入的差值大于radius时,编码出来的位不会有重叠位;
    注意:radius 和 resolution 是指关于输入,w 是指关于输出。

RandomDistributedScalarEncoder

1
2
3
4
5
6
rdse = RandomDistributedScalarEncoder(n=21, w=3, resolution=1, offset=2)

print rdse.encode(3) #[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1]
print rdse.encode(4) #[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
print rdse.encode(5) #[0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0]
print rdse.encode(6) #[0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]

RandomDistributedScalarEncoder 和 ScalarEncoder 类似,都是编码数值的,但它是 ScalarEncoder 的升级版本,它能动态增加大小,只需要设置参数resolution。由于该编码器是随机映射buckets位置,所以虽然参数相同,但要求也有些区别:

  • n:官方要求其值最少要大于6*w,而最合适的值是大于11*w;
  • offset:用于计算映射bucket索引位置,通常取[offset - resolution/2, offset + resolution/2);
  • seed:随机分布的固定因子,如果设置为-1则表示没有该因子。

DateEncoder

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
from datetime import datetime
from nupic.encoders import DateEncoder
de = DateEncoder(season=5, dayOfWeek=7, weekend=1, timeOfDay=23, forced=True)
print de.encode(datetime.strptime("2018-02-01 00:00:00", "%Y-%m-%d %H:%M:%S"))
# [1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
# 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1]
print de.encode(datetime.strptime("2018-02-02 00:00:00", "%Y-%m-%d %H:%M:%S"))
# [1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
# 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1]
print de.encode(datetime.strptime("2018-02-02 01:00:00", "%Y-%m-%d %H:%M:%S"))
# [1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
# 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1]
print de.encode(datetime.strptime("2018-03-01 00:00:00", "%Y-%m-%d %H:%M:%S"))
# [0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1
# 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1]

CategoryEncoder

1
2
3
4
5
6
7
8
9
10
11
from nupic.encoders.category import CategoryEncoder
category = ['cat', 'dog', 'monkey', 'human']
ce = CategoryEncoder(w=3, categoryList=category, forced=True)
print ce.encode('cat') #[0 0 0 1 1 1 0 0 0 0 0 0 0 0 0]
print ce.encode('dog') #[0 0 0 0 0 0 1 1 1 0 0 0 0 0 0]
print ce.encode('monkey') #[0 0 0 0 0 0 0 0 0 1 1 1 0 0 0]
print ce.encode('human') #[0 0 0 0 0 0 0 0 0 0 0 0 1 1 1]
print ce.encode('unkown') #[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0]

cat_unkown = numpy.array([1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0])
print ce.decode(cat_unkown) #({'category': ([(0, 2)], '<UNKNOWN>, cat, dog')}, ['category'])

Spatial Pooler

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
from nupic.algorithms.spatial_pooler import SpatialPooler
sp = SpatialPooler(
inputDimensions=(15,),
columnDimensions=(4,),
potentialRadius=15,
numActiveColumnsPerInhArea=1,
globalInhibition=True,
synPermActiveInc=0.03,
potentialPct=1
)
for column in range(4):
connected = numpy.zeros((15,), dtype='int')
sp.getConnectedSynapses(column, connected)
print(connected)
# Output:
# [1 0 1 1 0 0 1 0 0 1 0 0 1 0 0]
# [0 0 0 1 1 0 0 1 1 0 1 1 0 0 0]
# [0 1 0 0 1 1 1 0 1 0 1 1 0 1 0]
# [1 0 0 1 1 0 0 1 1 1 1 0 1 0 1]

注意,columnDimensions >= columns

1
2
3
4
5
6
7
output = numpy.zeros((4,), dtype='int')
noisyCat = numpy.zeros((15,), dtype='int')
noisyCat[3] = 1
noisyCat[4] = 1
noisyCat[6] = 1
sp.compute(noisyCat, learn=False, activeArray=output)
print(output)

Temporal Memory

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import numpy
from nupic.algorithms.backtracking_tm import BacktrackingTM
tm = BacktrackingTM(numberOfCols=50, cellsPerColumn=2, initialPerm=0.5, connectedPerm=0.5,
minThreshold=10, newSynapseCount=10, permanenceInc=0.1, permanenceDec=0.0, activationThreshold=8,
globalDecay=0, burnIn=1, checkSynapseConsistency=False, pamLength=10)
x = numpy.zeros((5, tm.numberOfCols), dtype="uint32")
x[0, 0:10] = 1
x[1, 10:20] = 1
x[2, 20:30] = 1
x[3, 30:40] = 1
x[4, 40:50] = 1

def formatRow(x):
s = ''
for c in range(len(x)):
if c > 0 and c % 10 == 0:
s += ' '
s += str(x[c])
s += ' '
return s

for i in range(10):
for j in range(5):
tm.compute(x[j], enableLearn=True, enableInference=False)
tm.reset()

for j in range(5):
print('-'*40+'ABCDE'[j]+'-'*40)
print('原始数据: {}'.format(formatRow(x[j])))
tm.compute(x[j], enableLearn=False, enableInference=True)
print('己激活和预测的细胞:')
tm.printStates(printPrevious=False, printLearnState=False)
print('下面的列是tm预测激活细胞:')
predictedCells = tm.getPredictedState()
print(formatRow(predictedCells.max(axis=1).nonzero()))
print('\n')

输出

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
----------------------------------------A----------------------------------------
原始数据: 1111111111 0000000000 0000000000 0000000000 0000000000
己激活和预测的细胞:

Inference Active state
1111111111 0000000000 0000000000 0000000000 0000000000
0000000000 0000000000 0000000000 0000000000 0000000000
Inference Predicted state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 1111111111 0000000000 0000000000 0000000000
下面的列是tm预测激活细胞:
[10 11 12 13 14 15 16 17 18 19]


----------------------------------------B----------------------------------------
原始数据: 0000000000 1111111111 0000000000 0000000000 0000000000
己激活和预测的细胞:

Inference Active state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 1111111111 0000000000 0000000000 0000000000
Inference Predicted state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 0000000000 1111111111 0000000000 0000000000
下面的列是tm预测激活细胞:
[20 21 22 23 24 25 26 27 28 29]


----------------------------------------C----------------------------------------
原始数据: 0000000000 0000000000 1111111111 0000000000 0000000000
己激活和预测的细胞:

Inference Active state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 0000000000 1111111111 0000000000 0000000000
Inference Predicted state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 0000000000 0000000000 1111111111 0000000000
下面的列是tm预测激活细胞:
[30 31 32 33 34 35 36 37 38 39]


----------------------------------------D----------------------------------------
原始数据: 0000000000 0000000000 0000000000 1111111111 0000000000
己激活和预测的细胞:

Inference Active state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 0000000000 0000000000 1111111111 0000000000
Inference Predicted state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 0000000000 0000000000 0000000000 1111111111
下面的列是tm预测激活细胞:
[40 41 42 43 44 45 46 47 48 49]


----------------------------------------E----------------------------------------
原始数据: 0000000000 0000000000 0000000000 0000000000 1111111111
己激活和预测的细胞:

Inference Active state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 0000000000 0000000000 0000000000 1111111111
Inference Predicted state
0000000000 0000000000 0000000000 0000000000 0000000000
0000000000 0000000000 0000000000 0000000000 0000000000
下面的列是tm预测激活细胞:
[]

Prediction

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from nupic.data.file_record_stream import FileRecordStream
from pkg_resources import resource_filename
from nupic.frameworks.opf.model_factory import ModelFactory

MODEL_PARAMS_Prediction = {
'model': "HTMPrediction",
'version': 1,
'aggregationInfo': {
'days': 0,
'fields': [('consumption', 'sum')],
'hours': 1,
'microseconds': 0,
'milliseconds': 0,
'minutes': 0,
'months': 0,
'seconds': 0,
'weeks': 0,
'years': 0
},
'predictAheadTime': None,
'modelParams': {
'inferenceType': 'TemporalMultiStep',
'sensorParams': {
'verbosity' : 0,
'encoders': {
u'timestamp_timeOfDay': {
'fieldname': u'timestamp',
'name': u'timestamp_timeOfDay',
'timeOfDay': (21, 0.5),
'type': 'DateEncoder'
},
u'timestamp_dayOfWeek': None,
u'timestamp_weekend': None,
u'consumption': {
'clipInput': True,
'fieldname': u'consumption',
'maxval': 100.0,
'minval': 0.0,
'n': 50,
'name': u'c1',
'type': 'ScalarEncoder',
'w': 21
},
},
'sensorAutoReset' : None,
},
'spEnable': True,
'spParams': {
'spVerbosity' : 0,
'spatialImp' : 'cpp',
'globalInhibition': 1,
'columnCount': 2048,
'inputWidth': 0,
'numActiveColumnsPerInhArea': 40,
'seed': 1956,
'potentialPct': 0.5,
'synPermConnected': 0.1,
'synPermActiveInc': 0.1,
'synPermInactiveDec': 0.005,
},
'tmEnable' : True,
'tmParams': {
'verbosity': 0,
'columnCount': 2048,
'cellsPerColumn': 32,
'inputWidth': 2048,
'seed': 1960,
'temporalImp': 'cpp',
'newSynapseCount': 20,
'maxSynapsesPerSegment': 32,
'maxSegmentsPerCell': 128,
'initialPerm': 0.21,
'permanenceInc': 0.1,
'permanenceDec' : 0.1,
'globalDecay': 0.0,
'maxAge': 0,
'minThreshold': 9,
'activationThreshold': 12,
'outputType': 'normal',
'pamLength': 1,
},

'clParams': {
'regionName' : 'SDRClassifierRegion',
'verbosity' : 0,
'alpha': 0.005,
'steps': '1,5',

'implementation': 'cpp',
},

'trainSPNetOnlyIfRequested': False,
},
}

def getData():
filePath = resource_filename('nupic.datafiles', 'extra/hotgym/hotgym.csv')
return FileRecordStream(filePath)

model = ModelFactory.create(MODEL_PARAMS_Prediction)
model.enableInference({'predictedField': 'consumption'})
data = getData()
for _ in range(10):
record = dict(zip(data.getFieldNames(), data.next()))
result = model.run(record)
print('input: {}, prediction: {}'.format(record['consumption'], result.inferences['multiStepBestPredictions'][1]))

# console 输出:
# input: 5.3, prediction: 5.3
# input: 5.5, prediction: 5.5
# input: 5.1, prediction: 5.36
# input: 5.3, prediction: 5.1
# input: 5.2, prediction: 5.342
# input: 5.5, prediction: 5.2994
# input: 4.5, prediction: 5.35958
# input: 1.2, prediction: 4.92
# input: 1.1, prediction: 1.2
# input: 1.2, prediction: 1.17

AnomalyScore

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
MODEL_PARAMS_Anomaly = {
'model': "HTMPrediction",
'version': 1,
'aggregationInfo': {
'days': 0,
'fields': [('consumption', 'sum')],
'hours': 1,
'microseconds': 0,
'milliseconds': 0,
'minutes': 0,
'months': 0,
'seconds': 0,
'weeks': 0,
'years': 0
},
'predictAheadTime': None,
'modelParams': {
'inferenceType': 'TemporalAnomaly',
'sensorParams': {
'verbosity' : 0,
'encoders': {
u'timestamp_timeOfDay': {
'fieldname': u'timestamp',
'name': u'timestamp_timeOfDay',
'timeOfDay': (21, 0.5),
'type': 'DateEncoder'},
u'timestamp_dayOfWeek': None,
u'timestamp_weekend': None,
u'consumption': {
'clipInput': True,
'fieldname': u'consumption',
'maxval': 100.0,
'minval': 0.0,
'n': 50,
'name': u'c1',
'type': 'ScalarEncoder',
'w': 21},},
'sensorAutoReset' : None,
},
'spEnable': True,
'spParams': {
'spVerbosity' : 0,
'spatialImp' : 'cpp',
'globalInhibition': 1,
'columnCount': 2048,
'inputWidth': 0,
'numActiveColumnsPerInhArea': 40,
'seed': 1956,
'potentialPct': 0.5,
'synPermConnected': 0.1,
'synPermActiveInc': 0.1,
'synPermInactiveDec': 0.005,
},
'tmEnable' : True,
'tmParams': {
'verbosity': 0,
'columnCount': 2048,
'cellsPerColumn': 32,
'inputWidth': 2048,
'seed': 1960,
'temporalImp': 'cpp',
'newSynapseCount': 20,
'maxSynapsesPerSegment': 32,
'maxSegmentsPerCell': 128,
'initialPerm': 0.21,
'permanenceInc': 0.1,
'permanenceDec' : 0.1,
'globalDecay': 0.0,
'maxAge': 0,
'minThreshold': 9,
'activationThreshold': 12,
'outputType': 'normal',
'pamLength': 1,
},
'clParams': {
'regionName' : 'SDRClassifierRegion',
'verbosity' : 0,
'alpha': 0.005,
'steps': '1',
'implementation': 'cpp',
},
'anomalyParams': {
u'anomalyCacheRecords': None,
u'autoDetectThreshold': None,
u'autoDetectWaitRecords': 2184
},
'trainSPNetOnlyIfRequested': False,
},
}

def getData():
filePath = resource_filename('nupic.datafiles', 'extra/hotgym/hotgym.csv')
return FileRecordStream(filePath)

model = ModelFactory.create(MODEL_PARAMS_Anomaly)
model.enableInference({'predictedField': 'consumption'})

data = getData()
for _ in xrange(10):
record = dict(zip(data.getFieldNames(), data.next()))
result = model.run(record)
print('input: {}, prediction: {}, anomalyScore: {}'.format(record["consumption"], result.inferences["multiStepBestPredictions"][1], result.inferences['anomalyScore']))

# console输出:
# input: 5.3, prediction: 5.3, anomalyScore: 1.0
# input: 5.5, prediction: 5.5, anomalyScore: 1.0
# input: 5.1, prediction: 5.36, anomalyScore: 1.0
# input: 5.3, prediction: 5.1, anomalyScore: 0.449999988079
# input: 5.2, prediction: 5.342, anomalyScore: 0.40000000596
# input: 5.5, prediction: 5.2994, anomalyScore: 0.17499999702
# input: 4.5, prediction: 5.35958, anomalyScore: 0.0750000029802
# input: 1.2, prediction: 4.92, anomalyScore: 0.125
# input: 1.1, prediction: 1.2, anomalyScore: 0.17499999702
# input: 1.2, prediction: 1.17, anomalyScore: 0.10000000149

Swarming