利用python进入数据分析之Numpy基础知识

导入相关包

In [182]:
%matplotlib inline
from __future__ import division
from numpy.random import randn
import numpy as np
np.set_printoptions(precision=4, suppress=True)

NumPy 的ndarray: 一种多维数组对象

In [2]:
data = randn(2, 3)  # 两行三列的数组,
In [4]:
data
Out[4]:
array([[-1.3219,  0.1149, -0.4221],
       [-1.7955, -1.1645, -0.7282]])
In [5]:
data * 10
Out[5]:
array([[-13.2186,   1.1495,  -4.221 ],
       [-17.9554, -11.6451,  -7.2818]])
In [6]:
data + data
Out[6]:
array([[-2.6437,  0.2299, -0.8442],
       [-3.5911, -2.329 , -1.4564]])
In [7]:
data.shape # 数据维度
Out[7]:
(2, 3)
In [8]:
data.dtype # 数据类型
Out[8]:
dtype('float64')

创建ndarray

In [9]:
data1 = [6, 7.5, 8, 0, 1]
arr1 = np.array(data1)
arr1
Out[9]:
array([ 6. ,  7.5,  8. ,  0. ,  1. ])
In [11]:
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
arr2 = np.array(data2)
arr2
Out[11]:
array([[1, 2, 3, 4],
       [5, 6, 7, 8]])
In [12]:
arr2.ndim
Out[12]:
2
In [13]:
arr2.shape
Out[13]:
(2, 4)
In [14]:
arr1.dtype
Out[14]:
dtype('float64')
In [15]:
arr2.dtype
Out[15]:
dtype('int32')
In [16]:
np.zeros(10) # 创建全0数组
Out[16]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])
In [17]:
np.zeros((3, 6))
Out[17]:
array([[ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.]])
In [18]:
np.empty((2, 3, 2)) # 不安全的用法
Out[18]:
array([[[ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.]],

       [[ 0.,  0.],
        [ 0.,  0.],
        [ 0.,  0.]]])
In [19]:
np.arange(15) #创建数组,按照顺序
Out[19]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

ndarrays的数据类型

In [20]:
arr1 = np.array([1, 2, 3], dtype=np.float64)
arr2 = np.array([1, 2, 3], dtype=np.int32)
In [21]:
arr1.dtype
Out[21]:
dtype('float64')
In [22]:
arr2.dtype
Out[22]:
dtype('int32')
In [23]:
arr = np.array([1, 2, 3, 4, 5])
In [24]:
arr.dtype
Out[24]:
dtype('int32')
In [25]:
float_arr = arr.astype(np.float64)  # 整数类型转换浮点数类型
In [26]:
float_arr.dtype
Out[26]:
dtype('float64')
In [27]:
arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])
In [28]:
arr
Out[28]:
array([  3.7,  -1.2,  -2.6,   0.5,  12.9,  10.1])
In [29]:
arr.astype(np.int32)  #浮点数转换成整数
Out[29]:
array([ 3, -1, -2,  0, 12, 10])
In [30]:
numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.string_)
numeric_strings.astype(float)  # 字符串转换成浮点数
Out[30]:
array([  1.25,  -9.6 ,  42.  ])
In [31]:
int_array = np.arange(10)
calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)
int_array.astype(calibers.dtype)
Out[31]:
array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9.])
In [32]:
empty_uint32 = np.empty(8, dtype='u4')
empty_uint32
Out[32]:
array([0, 0, 0, 0, 0, 0, 0, 0], dtype=uint32)

数组和标量之前的运算

In [33]:
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
In [34]:
arr
Out[34]:
array([[ 1.,  2.,  3.],
       [ 4.,  5.,  6.]])
In [35]:
arr * arr
Out[35]:
array([[  1.,   4.,   9.],
       [ 16.,  25.,  36.]])
In [36]:
arr - arr
Out[36]:
array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])
In [37]:
1 / arr
Out[37]:
array([[ 1.    ,  0.5   ,  0.3333],
       [ 0.25  ,  0.2   ,  0.1667]])
In [38]:
arr ** 0.5
Out[38]:
array([[ 1.    ,  1.4142,  1.7321],
       [ 2.    ,  2.2361,  2.4495]])

基本的索引和切片

In [39]:
arr = np.arange(10)
In [40]:
arr
Out[40]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [41]:
arr[5]
Out[41]:
5
In [42]:
arr[5:8]
Out[42]:
array([5, 6, 7])
In [43]:
arr[5:8] = 12 # 将5到8赋值为12
In [44]:
arr
Out[44]:
array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])
In [45]:
arr_slice = arr[5:8]
In [46]:
arr_slice[1] = 12345
In [47]:
arr
Out[47]:
array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,     9])
In [49]:
arr_slice[:] = 64
In [50]:
arr
Out[50]:
array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])
In [51]:
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d[2]
Out[51]:
array([7, 8, 9])
In [53]:
arr2d[0][2]
Out[53]:
3
In [54]:
arr2d[0, 2]
Out[54]:
3
In [55]:
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
arr3d
Out[55]:
array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])
In [56]:
arr3d[0]
Out[56]:
array([[1, 2, 3],
       [4, 5, 6]])
In [57]:
old_values = arr3d[0].copy()
arr3d[0] = 42
arr3d
Out[57]:
array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])
In [58]:
arr3d[0] = old_values
arr3d
Out[58]:
array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])
In [59]:
arr3d[1, 0]
Out[59]:
array([7, 8, 9])

切片索引

In [60]:
arr[1:6]
Out[60]:
array([ 1,  2,  3,  4, 64])
In [61]:
arr2d
Out[61]:
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])
In [62]:
arr2d[:2]
Out[62]:
array([[1, 2, 3],
       [4, 5, 6]])
In [63]:
arr2d[:2, 1:]
Out[63]:
array([[2, 3],
       [5, 6]])
In [64]:
arr2d[1, :2]
Out[64]:
array([4, 5])
In [65]:
arr2d[2, :1]
Out[65]:
array([7])
In [66]:
arr2d[:, :1]
Out[66]:
array([[1],
       [4],
       [7]])
In [67]:
arr2d[:2, 1:] = 0
In [68]:
arr2d
Out[68]:
array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])

布尔型索引

In [69]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
data = randn(7, 4) # 生成正太分布的随机数据
In [70]:
names
Out[70]:
array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], 
      dtype='|S4')
In [71]:
data
Out[71]:
array([[ 0.1754, -1.0822, -0.2513,  0.4978],
       [-1.7729,  1.6647, -1.2513,  0.432 ],
       [ 0.887 , -1.2095,  1.4749,  1.1566],
       [ 0.0033,  1.1249,  1.4727, -0.2798],
       [-1.213 ,  0.2127,  2.3372,  0.6105],
       [ 1.1653, -0.6228, -0.1151,  2.0755],
       [-1.136 ,  0.0232, -1.7908,  1.0971]])
In [72]:
names == 'Bob'
Out[72]:
array([ True, False, False,  True, False, False, False], dtype=bool)
In [81]:
data[names == 'Bob'] # 相当于只要第一行和第三行数据
Out[81]:
array([[ 0.1754, -1.0822, -0.2513,  0.4978],
       [ 0.0033,  1.1249,  1.4727, -0.2798]])
In [86]:
data[names == 'Bob', 2:]
Out[86]:
array([[-0.2513,  0.4978],
       [ 1.4727, -0.2798]])
In [87]:
data[names == 'Bob', 3]
Out[87]:
array([ 0.4978, -0.2798])
In [88]:
names != 'Bob'
Out[88]:
array([False,  True,  True, False,  True,  True,  True], dtype=bool)
In [89]:
data[-(names == 'Bob')]
D:\python2713\lib\anaconda_install\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: numpy boolean negative, the `-` operator, is deprecated, use the `~` operator or the logical_not function instead.
  """Entry point for launching an IPython kernel.
Out[89]:
array([[-1.7729,  1.6647, -1.2513,  0.432 ],
       [ 0.887 , -1.2095,  1.4749,  1.1566],
       [-1.213 ,  0.2127,  2.3372,  0.6105],
       [ 1.1653, -0.6228, -0.1151,  2.0755],
       [-1.136 ,  0.0232, -1.7908,  1.0971]])
In [90]:
mask = (names == 'Bob') | (names == 'Will')
mask
data[mask]
Out[90]:
array([[ 0.1754, -1.0822, -0.2513,  0.4978],
       [ 0.887 , -1.2095,  1.4749,  1.1566],
       [ 0.0033,  1.1249,  1.4727, -0.2798],
       [-1.213 ,  0.2127,  2.3372,  0.6105]])
In [92]:
data[data < 0] = 0 # 将小于0的值,置0
data
Out[92]:
array([[ 0.1754,  0.    ,  0.    ,  0.4978],
       [ 0.    ,  1.6647,  0.    ,  0.432 ],
       [ 0.887 ,  0.    ,  1.4749,  1.1566],
       [ 0.0033,  1.1249,  1.4727,  0.    ],
       [ 0.    ,  0.2127,  2.3372,  0.6105],
       [ 1.1653,  0.    ,  0.    ,  2.0755],
       [ 0.    ,  0.0232,  0.    ,  1.0971]])
In [93]:
data[names != 'Joe'] = 7
data
Out[93]:
array([[ 7.    ,  7.    ,  7.    ,  7.    ],
       [ 0.    ,  1.6647,  0.    ,  0.432 ],
       [ 7.    ,  7.    ,  7.    ,  7.    ],
       [ 7.    ,  7.    ,  7.    ,  7.    ],
       [ 7.    ,  7.    ,  7.    ,  7.    ],
       [ 1.1653,  0.    ,  0.    ,  2.0755],
       [ 0.    ,  0.0232,  0.    ,  1.0971]])

花式索引

In [94]:
arr = np.empty((8, 4))
for i in range(8):
    arr[i] = i
arr
Out[94]:
array([[ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 2.,  2.,  2.,  2.],
       [ 3.,  3.,  3.,  3.],
       [ 4.,  4.,  4.,  4.],
       [ 5.,  5.,  5.,  5.],
       [ 6.,  6.,  6.,  6.],
       [ 7.,  7.,  7.,  7.]])
In [95]:
arr[[4, 3, 0, 6]]
Out[95]:
array([[ 4.,  4.,  4.,  4.],
       [ 3.,  3.,  3.,  3.],
       [ 0.,  0.,  0.,  0.],
       [ 6.,  6.,  6.,  6.]])
In [96]:
arr[[-3, -5, -7]]
Out[96]:
array([[ 5.,  5.,  5.,  5.],
       [ 3.,  3.,  3.,  3.],
       [ 1.,  1.,  1.,  1.]])
In [98]:
# more on reshape in Chapter 12
arr = np.arange(32).reshape((8, 4))
arr
Out[98]:
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])
In [99]:
arr[[1, 5, 7, 2], [0, 3, 1, 2]]  #行要1 5 7 2,列要0 3 1 2
Out[99]:
array([ 4, 23, 29, 10])
In [100]:
arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]
Out[100]:
array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])
In [101]:
arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])]
Out[101]:
array([[ 4,  7,  5,  6],
       [20, 23, 21, 22],
       [28, 31, 29, 30],
       [ 8, 11,  9, 10]])

数组转置和轴对换

In [103]:
arr = np.arange(15).reshape((3, 5))
In [104]:
arr
Out[104]:
array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])
In [105]:
arr.T
Out[105]:
array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])
In [106]:
arr = np.random.randn(6, 3)
In [107]:
np.dot(arr.T, arr) # 计算矩阵内积
Out[107]:
array([[ 5.9228,  2.1329,  0.0465],
       [ 2.1329,  5.5028,  0.5327],
       [ 0.0465,  0.5327,  2.4805]])
In [108]:
arr = np.arange(16).reshape((2, 2, 4))
In [109]:
arr
Out[109]:
array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])
In [110]:
arr.transpose((1, 0, 2)) #轴转置
Out[110]:
array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])
In [111]:
arr
Out[111]:
array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])
In [112]:
arr.swapaxes(1, 2)
Out[112]:
array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])

通用函数:快速的元素级数组函数

In [115]:
arr = np.arange(10)
np.sqrt(arr) #每个元素求平方根
Out[115]:
array([ 0.    ,  1.    ,  1.4142,  1.7321,  2.    ,  2.2361,  2.4495,
        2.6458,  2.8284,  3.    ])
In [116]:
np.exp(arr) #计算各元素的指数
Out[116]:
array([    1.    ,     2.7183,     7.3891,    20.0855,    54.5982,
         148.4132,   403.4288,  1096.6332,  2980.958 ,  8103.0839])
In [117]:
x = randn(8)
y = randn(8)
In [118]:
x
Out[118]:
array([ 1.7234,  0.8103, -0.3624, -0.0856, -0.7728,  0.1498,  1.2229,
        1.563 ])
In [119]:
y
Out[119]:
array([-0.3525, -1.0556,  2.1128, -0.239 , -0.5078, -2.6433, -1.0742,
        1.0488])
In [120]:
np.maximum(x, y) # element-wise maximum(元素级最大值)
Out[120]:
array([ 1.7234,  0.8103,  2.1128, -0.0856, -0.5078,  0.1498,  1.2229,
        1.563 ])
In [121]:
arr = randn(7) * 5
In [123]:
arr
Out[123]:
array([ 6.2607, -1.8589, -3.6169, -2.3736,  3.765 ,  2.9481, -1.1368])
In [124]:
np.modf(arr) #将数组的小树和整数部分以两个独立的数组的形式返回 
Out[124]:
(array([ 0.2607, -0.8589, -0.6169, -0.3736,  0.765 ,  0.9481, -0.1368]),
 array([ 6., -1., -3., -2.,  3.,  2., -1.]))

利用数组进行数据处理

In [125]:
points = np.arange(-5, 5, 0.01) # 1000个间隔相等的点
In [126]:
points
Out[126]:
array([-5.  , -4.99, -4.98, -4.97, -4.96, -4.95, -4.94, -4.93, -4.92,
       -4.91, -4.9 , -4.89, -4.88, -4.87, -4.86, -4.85, -4.84, -4.83,
       -4.82, -4.81, -4.8 , -4.79, -4.78, -4.77, -4.76, -4.75, -4.74,
       -4.73, -4.72, -4.71, -4.7 , -4.69, -4.68, -4.67, -4.66, -4.65,
       -4.64, -4.63, -4.62, -4.61, -4.6 , -4.59, -4.58, -4.57, -4.56,
       -4.55, -4.54, -4.53, -4.52, -4.51, -4.5 , -4.49, -4.48, -4.47,
       -4.46, -4.45, -4.44, -4.43, -4.42, -4.41, -4.4 , -4.39, -4.38,
       -4.37, -4.36, -4.35, -4.34, -4.33, -4.32, -4.31, -4.3 , -4.29,
       -4.28, -4.27, -4.26, -4.25, -4.24, -4.23, -4.22, -4.21, -4.2 ,
       -4.19, -4.18, -4.17, -4.16, -4.15, -4.14, -4.13, -4.12, -4.11,
       -4.1 , -4.09, -4.08, -4.07, -4.06, -4.05, -4.04, -4.03, -4.02,
       -4.01, -4.  , -3.99, -3.98, -3.97, -3.96, -3.95, -3.94, -3.93,
       -3.92, -3.91, -3.9 , -3.89, -3.88, -3.87, -3.86, -3.85, -3.84,
       -3.83, -3.82, -3.81, -3.8 , -3.79, -3.78, -3.77, -3.76, -3.75,
       -3.74, -3.73, -3.72, -3.71, -3.7 , -3.69, -3.68, -3.67, -3.66,
       -3.65, -3.64, -3.63, -3.62, -3.61, -3.6 , -3.59, -3.58, -3.57,
       -3.56, -3.55, -3.54, -3.53, -3.52, -3.51, -3.5 , -3.49, -3.48,
       -3.47, -3.46, -3.45, -3.44, -3.43, -3.42, -3.41, -3.4 , -3.39,
       -3.38, -3.37, -3.36, -3.35, -3.34, -3.33, -3.32, -3.31, -3.3 ,
       -3.29, -3.28, -3.27, -3.26, -3.25, -3.24, -3.23, -3.22, -3.21,
       -3.2 , -3.19, -3.18, -3.17, -3.16, -3.15, -3.14, -3.13, -3.12,
       -3.11, -3.1 , -3.09, -3.08, -3.07, -3.06, -3.05, -3.04, -3.03,
       -3.02, -3.01, -3.  , -2.99, -2.98, -2.97, -2.96, -2.95, -2.94,
       -2.93, -2.92, -2.91, -2.9 , -2.89, -2.88, -2.87, -2.86, -2.85,
       -2.84, -2.83, -2.82, -2.81, -2.8 , -2.79, -2.78, -2.77, -2.76,
       -2.75, -2.74, -2.73, -2.72, -2.71, -2.7 , -2.69, -2.68, -2.67,
       -2.66, -2.65, -2.64, -2.63, -2.62, -2.61, -2.6 , -2.59, -2.58,
       -2.57, -2.56, -2.55, -2.54, -2.53, -2.52, -2.51, -2.5 , -2.49,
       -2.48, -2.47, -2.46, -2.45, -2.44, -2.43, -2.42, -2.41, -2.4 ,
       -2.39, -2.38, -2.37, -2.36, -2.35, -2.34, -2.33, -2.32, -2.31,
       -2.3 , -2.29, -2.28, -2.27, -2.26, -2.25, -2.24, -2.23, -2.22,
       -2.21, -2.2 , -2.19, -2.18, -2.17, -2.16, -2.15, -2.14, -2.13,
       -2.12, -2.11, -2.1 , -2.09, -2.08, -2.07, -2.06, -2.05, -2.04,
       -2.03, -2.02, -2.01, -2.  , -1.99, -1.98, -1.97, -1.96, -1.95,
       -1.94, -1.93, -1.92, -1.91, -1.9 , -1.89, -1.88, -1.87, -1.86,
       -1.85, -1.84, -1.83, -1.82, -1.81, -1.8 , -1.79, -1.78, -1.77,
       -1.76, -1.75, -1.74, -1.73, -1.72, -1.71, -1.7 , -1.69, -1.68,
       -1.67, -1.66, -1.65, -1.64, -1.63, -1.62, -1.61, -1.6 , -1.59,
       -1.58, -1.57, -1.56, -1.55, -1.54, -1.53, -1.52, -1.51, -1.5 ,
       -1.49, -1.48, -1.47, -1.46, -1.45, -1.44, -1.43, -1.42, -1.41,
       -1.4 , -1.39, -1.38, -1.37, -1.36, -1.35, -1.34, -1.33, -1.32,
       -1.31, -1.3 , -1.29, -1.28, -1.27, -1.26, -1.25, -1.24, -1.23,
       -1.22, -1.21, -1.2 , -1.19, -1.18, -1.17, -1.16, -1.15, -1.14,
       -1.13, -1.12, -1.11, -1.1 , -1.09, -1.08, -1.07, -1.06, -1.05,
       -1.04, -1.03, -1.02, -1.01, -1.  , -0.99, -0.98, -0.97, -0.96,
       -0.95, -0.94, -0.93, -0.92, -0.91, -0.9 , -0.89, -0.88, -0.87,
       -0.86, -0.85, -0.84, -0.83, -0.82, -0.81, -0.8 , -0.79, -0.78,
       -0.77, -0.76, -0.75, -0.74, -0.73, -0.72, -0.71, -0.7 , -0.69,
       -0.68, -0.67, -0.66, -0.65, -0.64, -0.63, -0.62, -0.61, -0.6 ,
       -0.59, -0.58, -0.57, -0.56, -0.55, -0.54, -0.53, -0.52, -0.51,
       -0.5 , -0.49, -0.48, -0.47, -0.46, -0.45, -0.44, -0.43, -0.42,
       -0.41, -0.4 , -0.39, -0.38, -0.37, -0.36, -0.35, -0.34, -0.33,
       -0.32, -0.31, -0.3 , -0.29, -0.28, -0.27, -0.26, -0.25, -0.24,
       -0.23, -0.22, -0.21, -0.2 , -0.19, -0.18, -0.17, -0.16, -0.15,
       -0.14, -0.13, -0.12, -0.11, -0.1 , -0.09, -0.08, -0.07, -0.06,
       -0.05, -0.04, -0.03, -0.02, -0.01, -0.  ,  0.01,  0.02,  0.03,
        0.04,  0.05,  0.06,  0.07,  0.08,  0.09,  0.1 ,  0.11,  0.12,
        0.13,  0.14,  0.15,  0.16,  0.17,  0.18,  0.19,  0.2 ,  0.21,
        0.22,  0.23,  0.24,  0.25,  0.26,  0.27,  0.28,  0.29,  0.3 ,
        0.31,  0.32,  0.33,  0.34,  0.35,  0.36,  0.37,  0.38,  0.39,
        0.4 ,  0.41,  0.42,  0.43,  0.44,  0.45,  0.46,  0.47,  0.48,
        0.49,  0.5 ,  0.51,  0.52,  0.53,  0.54,  0.55,  0.56,  0.57,
        0.58,  0.59,  0.6 ,  0.61,  0.62,  0.63,  0.64,  0.65,  0.66,
        0.67,  0.68,  0.69,  0.7 ,  0.71,  0.72,  0.73,  0.74,  0.75,
        0.76,  0.77,  0.78,  0.79,  0.8 ,  0.81,  0.82,  0.83,  0.84,
        0.85,  0.86,  0.87,  0.88,  0.89,  0.9 ,  0.91,  0.92,  0.93,
        0.94,  0.95,  0.96,  0.97,  0.98,  0.99,  1.  ,  1.01,  1.02,
        1.03,  1.04,  1.05,  1.06,  1.07,  1.08,  1.09,  1.1 ,  1.11,
        1.12,  1.13,  1.14,  1.15,  1.16,  1.17,  1.18,  1.19,  1.2 ,
        1.21,  1.22,  1.23,  1.24,  1.25,  1.26,  1.27,  1.28,  1.29,
        1.3 ,  1.31,  1.32,  1.33,  1.34,  1.35,  1.36,  1.37,  1.38,
        1.39,  1.4 ,  1.41,  1.42,  1.43,  1.44,  1.45,  1.46,  1.47,
        1.48,  1.49,  1.5 ,  1.51,  1.52,  1.53,  1.54,  1.55,  1.56,
        1.57,  1.58,  1.59,  1.6 ,  1.61,  1.62,  1.63,  1.64,  1.65,
        1.66,  1.67,  1.68,  1.69,  1.7 ,  1.71,  1.72,  1.73,  1.74,
        1.75,  1.76,  1.77,  1.78,  1.79,  1.8 ,  1.81,  1.82,  1.83,
        1.84,  1.85,  1.86,  1.87,  1.88,  1.89,  1.9 ,  1.91,  1.92,
        1.93,  1.94,  1.95,  1.96,  1.97,  1.98,  1.99,  2.  ,  2.01,
        2.02,  2.03,  2.04,  2.05,  2.06,  2.07,  2.08,  2.09,  2.1 ,
        2.11,  2.12,  2.13,  2.14,  2.15,  2.16,  2.17,  2.18,  2.19,
        2.2 ,  2.21,  2.22,  2.23,  2.24,  2.25,  2.26,  2.27,  2.28,
        2.29,  2.3 ,  2.31,  2.32,  2.33,  2.34,  2.35,  2.36,  2.37,
        2.38,  2.39,  2.4 ,  2.41,  2.42,  2.43,  2.44,  2.45,  2.46,
        2.47,  2.48,  2.49,  2.5 ,  2.51,  2.52,  2.53,  2.54,  2.55,
        2.56,  2.57,  2.58,  2.59,  2.6 ,  2.61,  2.62,  2.63,  2.64,
        2.65,  2.66,  2.67,  2.68,  2.69,  2.7 ,  2.71,  2.72,  2.73,
        2.74,  2.75,  2.76,  2.77,  2.78,  2.79,  2.8 ,  2.81,  2.82,
        2.83,  2.84,  2.85,  2.86,  2.87,  2.88,  2.89,  2.9 ,  2.91,
        2.92,  2.93,  2.94,  2.95,  2.96,  2.97,  2.98,  2.99,  3.  ,
        3.01,  3.02,  3.03,  3.04,  3.05,  3.06,  3.07,  3.08,  3.09,
        3.1 ,  3.11,  3.12,  3.13,  3.14,  3.15,  3.16,  3.17,  3.18,
        3.19,  3.2 ,  3.21,  3.22,  3.23,  3.24,  3.25,  3.26,  3.27,
        3.28,  3.29,  3.3 ,  3.31,  3.32,  3.33,  3.34,  3.35,  3.36,
        3.37,  3.38,  3.39,  3.4 ,  3.41,  3.42,  3.43,  3.44,  3.45,
        3.46,  3.47,  3.48,  3.49,  3.5 ,  3.51,  3.52,  3.53,  3.54,
        3.55,  3.56,  3.57,  3.58,  3.59,  3.6 ,  3.61,  3.62,  3.63,
        3.64,  3.65,  3.66,  3.67,  3.68,  3.69,  3.7 ,  3.71,  3.72,
        3.73,  3.74,  3.75,  3.76,  3.77,  3.78,  3.79,  3.8 ,  3.81,
        3.82,  3.83,  3.84,  3.85,  3.86,  3.87,  3.88,  3.89,  3.9 ,
        3.91,  3.92,  3.93,  3.94,  3.95,  3.96,  3.97,  3.98,  3.99,
        4.  ,  4.01,  4.02,  4.03,  4.04,  4.05,  4.06,  4.07,  4.08,
        4.09,  4.1 ,  4.11,  4.12,  4.13,  4.14,  4.15,  4.16,  4.17,
        4.18,  4.19,  4.2 ,  4.21,  4.22,  4.23,  4.24,  4.25,  4.26,
        4.27,  4.28,  4.29,  4.3 ,  4.31,  4.32,  4.33,  4.34,  4.35,
        4.36,  4.37,  4.38,  4.39,  4.4 ,  4.41,  4.42,  4.43,  4.44,
        4.45,  4.46,  4.47,  4.48,  4.49,  4.5 ,  4.51,  4.52,  4.53,
        4.54,  4.55,  4.56,  4.57,  4.58,  4.59,  4.6 ,  4.61,  4.62,
        4.63,  4.64,  4.65,  4.66,  4.67,  4.68,  4.69,  4.7 ,  4.71,
        4.72,  4.73,  4.74,  4.75,  4.76,  4.77,  4.78,  4.79,  4.8 ,
        4.81,  4.82,  4.83,  4.84,  4.85,  4.86,  4.87,  4.88,  4.89,
        4.9 ,  4.91,  4.92,  4.93,  4.94,  4.95,  4.96,  4.97,  4.98,  4.99])
In [127]:
xs, ys = np.meshgrid(points, points) #矢量化
In [128]:
xs
Out[128]:
array([[-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       ..., 
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99]])
In [129]:
ys
Out[129]:
array([[-5.  , -5.  , -5.  , ..., -5.  , -5.  , -5.  ],
       [-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99],
       [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98],
       ..., 
       [ 4.97,  4.97,  4.97, ...,  4.97,  4.97,  4.97],
       [ 4.98,  4.98,  4.98, ...,  4.98,  4.98,  4.98],
       [ 4.99,  4.99,  4.99, ...,  4.99,  4.99,  4.99]])
In [130]:
from matplotlib.pyplot import imshow, title
In [137]:
import matplotlib.pyplot as plt
z = np.sqrt(xs ** 2 + ys ** 2)
In [138]:
z
Out[138]:
array([[ 7.0711,  7.064 ,  7.0569, ...,  7.0499,  7.0569,  7.064 ],
       [ 7.064 ,  7.0569,  7.0499, ...,  7.0428,  7.0499,  7.0569],
       [ 7.0569,  7.0499,  7.0428, ...,  7.0357,  7.0428,  7.0499],
       ..., 
       [ 7.0499,  7.0428,  7.0357, ...,  7.0286,  7.0357,  7.0428],
       [ 7.0569,  7.0499,  7.0428, ...,  7.0357,  7.0428,  7.0499],
       [ 7.064 ,  7.0569,  7.0499, ...,  7.0428,  7.0499,  7.0569]])
In [133]:
plt.imshow(z, cmap=plt.cm.gray); plt.colorbar()
Out[133]:
<matplotlib.colorbar.Colorbar at 0x6912210>
利用python进入数据分析之Numpy基础知识
In [134]:
plt.title("Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")
Out[134]:
<matplotlib.text.Text at 0x6bf7630>
利用python进入数据分析之Numpy基础知识
In [136]:
plt.draw()
<matplotlib.figure.Figure at 0x6c11a70>

将条件逻辑表述为数组运算

In [139]:
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
In [140]:
result = [(x if c else y)
          for x, y, c in zip(xarr, yarr, cond)]
result
Out[140]:
[1.1000000000000001, 2.2000000000000002, 1.3, 1.3999999999999999, 2.5]
In [141]:
result = np.where(cond, xarr, yarr)
result
Out[141]:
array([ 1.1,  2.2,  1.3,  1.4,  2.5])
In [142]:
arr = randn(4, 4)
arr
Out[142]:
array([[-0.4783, -1.3719,  1.1728, -1.0682],
       [ 0.5183, -0.0492,  0.2473,  0.4881],
       [-0.9058, -0.3842, -0.7508,  0.2185],
       [ 0.491 ,  1.1031,  0.9132, -1.251 ]])
In [144]:
np.where(arr > 0, 2, -2)
Out[144]:
array([[-2, -2,  2, -2],
       [ 2, -2,  2,  2],
       [-2, -2, -2,  2],
       [ 2,  2,  2, -2]])
In [145]:
np.where(arr > 0, 2, arr) # set only positive values to 2
Out[145]:
array([[-0.4783, -1.3719,  2.    , -1.0682],
       [ 2.    , -0.0492,  2.    ,  2.    ],
       [-0.9058, -0.3842, -0.7508,  2.    ],
       [ 2.    ,  2.    ,  2.    , -1.251 ]])

数学和统计方法

In [150]:
arr = np.random.randn(5, 4) # 正态分布数据
arr.mean()
Out[150]:
-0.016709442063762382
In [151]:
np.mean(arr) #平均数
Out[151]:
-0.016709442063762382
In [154]:
arr.sum() # 求和
Out[154]:
-0.33418884127524762
In [156]:
arr
Out[156]:
array([[ 0.4104,  0.6597,  1.8797, -1.0583],
       [-0.8747, -0.2524, -2.3187,  0.9536],
       [-0.5245,  2.3237, -0.3975,  0.9139],
       [ 0.367 , -0.3131, -1.5755,  0.7737],
       [ 0.4162, -0.9881,  0.1237, -0.8532]])
In [157]:
arr.mean(axis=1)
Out[157]:
array([ 0.4729, -0.6231,  0.5789, -0.187 , -0.3253])
In [158]:
arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
In [159]:
arr.cumsum(0) # 所有元素的累计和
Out[159]:
array([[ 0,  1,  2],
       [ 3,  5,  7],
       [ 9, 12, 15]])
In [161]:
arr.cumprod(1)# 所有元素的累计积
Out[161]:
array([[  0,   0,   0],
       [  3,  12,  60],
       [  6,  42, 336]])

用于布尔型数组的方法

In [163]:
arr = randn(100)
In [167]:
arr
Out[167]:
array([-1.4265, -0.6694,  2.2821,  3.0191, -2.0106, -0.0285, -0.0832,
       -1.1128, -0.4206, -1.511 ,  2.2606, -0.7998, -0.2611, -1.0766,
        1.1683, -0.697 ,  0.5537, -1.8937,  0.6898, -0.0265,  0.28  ,
       -1.4975,  0.2975,  0.9219,  0.232 ,  0.6146,  0.8859,  1.8052,
       -0.4248,  0.3869, -1.411 , -0.6007,  1.1243,  0.729 ,  0.413 ,
       -1.9315,  0.483 ,  0.1767, -0.2112,  0.5936,  0.3184, -0.1198,
       -0.7269, -1.1918, -0.4347, -0.545 , -0.9096,  0.6451, -0.4286,
        0.3266,  1.7095,  0.3773, -1.0539,  0.3573,  0.167 ,  0.4146,
       -0.5471,  0.9067,  1.244 ,  0.576 ,  1.3393,  0.2889,  0.7406,
        0.3776,  0.3121, -1.0605, -0.3497, -0.8814,  0.0519,  0.4618,
       -0.3093, -0.8532, -0.0788,  0.1572,  0.8356, -0.6176,  1.9662,
       -0.6302, -0.0095, -0.0759, -0.3393, -1.6962, -0.1854, -0.0469,
       -0.7051, -0.1093,  0.6321,  1.8718, -0.9474, -0.6264,  0.0707,
        1.1023, -1.8832, -1.8168, -2.4461,  0.6357,  0.2596,  0.9758,
        1.2792,  1.1529])
In [166]:
(arr > 0).sum() # Number of positive values
Out[166]:
50
In [168]:
bools = np.array([False, False, True, False])
In [169]:
bools.any() # 测试数组中是否存在一个或多个TRUE
Out[169]:
True
In [170]:
bools.all()# 测试数组中的值是否都是TRUE
Out[170]:
False

排序

In [171]:
arr = randn(8)
In [172]:
arr
Out[172]:
array([-0.0019, -1.0402, -0.166 , -0.5647,  0.5247, -0.7469,  0.6596,
        0.8014])
In [173]:
arr.sort()
In [174]:
arr
Out[174]:
array([-1.0402, -0.7469, -0.5647, -0.166 , -0.0019,  0.5247,  0.6596,
        0.8014])
In [175]:
arr = randn(5, 3)
In [176]:
arr
Out[176]:
array([[-0.5776,  0.0369, -0.5198],
       [ 0.8355,  1.4866, -0.0847],
       [ 2.2603, -1.2001,  0.1705],
       [-0.8467, -0.0015,  0.8709],
       [-0.884 , -0.5404, -1.6182]])
In [177]:
arr.sort(1)
In [178]:
arr
Out[178]:
array([[-0.5776, -0.5198,  0.0369],
       [-0.0847,  0.8355,  1.4866],
       [-1.2001,  0.1705,  2.2603],
       [-0.8467, -0.0015,  0.8709],
       [-1.6182, -0.884 , -0.5404]])

唯一化以及其他的集合逻辑

In [185]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
np.unique(names) # 找出数组中的唯一值
Out[185]:
array(['Bob', 'Joe', 'Will'], 
      dtype='|S4')
In [186]:
ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])
np.unique(ints)
Out[186]:
array([1, 2, 3, 4])

用于数组文件的输入输出

将数组以二进制格式保存到磁盘

In [188]:
arr = np.arange(10)
np.save('some_array', arr)
In [189]:
np.load('some_array.npy')
Out[189]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [190]:
np.savez('array_archive.npz', a=arr, b=arr) # 用压缩文件的方式加载
In [191]:
arch = np.load('array_archive.npz') # 压缩文件的方式导入
arch['b']
Out[191]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

线性代数

In [194]:
x = np.array([[1., 2., 3.], [4., 5., 6.]])
y = np.array([[6., 23.], [-1, 7], [8, 9]])
In [195]:
x
Out[195]:
array([[ 1.,  2.,  3.],
       [ 4.,  5.,  6.]])
In [196]:
y
Out[196]:
array([[  6.,  23.],
       [ -1.,   7.],
       [  8.,   9.]])
In [197]:
x.dot(y)  # 计算内积
Out[197]:
array([[  28.,   64.],
       [  67.,  181.]])
In [198]:
np.dot(x, np.ones(3))
Out[198]:
array([  6.,  15.])

随机数生成

In [200]:
samples = np.random.normal(size=(4, 4))
In [201]:
samples
Out[201]:
array([[-0.2047,  0.4789, -0.5194, -0.5557],
       [ 1.9658,  1.3934,  0.0929,  0.2817],
       [ 0.769 ,  1.2464,  1.0072, -1.2962],
       [ 0.275 ,  0.2289,  1.3529,  0.8864]])
In [202]:
from random import normalvariate
N = 1000000
%timeit samples = [normalvariate(0, 1) for _ in xrange(N)]
1 loop, best of 3: 1.05 s per loop
In [204]:
%timeit np.random.normal(size=N) # np的random.normal模块要比python内置的random块
10 loops, best of 3: 48.6 ms per loop

范例:随机漫步

In [205]:
import random
position = 0
walk = [position]
steps = 1000
for i in xrange(steps):
    step = 1 if random.randint(0, 1) else -1
    position += step
    walk.append(position)
In [236]:
nsteps = 1000
draws = np.random.randint(0, 2, size=nsteps) #随机生成0 1共1000个
steps = np.where(draws > 0, 1, -1)# 进行累加
walk = steps.cumsum()
In [237]:
walk.min()
Out[237]:
-19
In [238]:
walk.max()
Out[238]:
40
In [239]:
(np.abs(walk) >= 10).argmax() #多久才能超过绝对值10
Out[239]:
99

一次模拟多个随机漫步

In [242]:
nwalks = 5000
nsteps = 1000
draws = np.random.randint(0, 2, size=(nwalks, nsteps)) # 0 or 1
steps = np.where(draws > 0, 1, -1) #将0和1变成-1和1
walks = steps.cumsum(1)
walks
Out[242]:
array([[  1,   2,   3, ..., -30, -31, -32],
       [ -1,   0,  -1, ...,  22,  23,  24],
       [ -1,   0,   1, ...,  -8,  -9,  -8],
       ..., 
       [  1,   0,  -1, ..., -22, -23, -24],
       [ -1,  -2,  -3, ...,  -8,  -7,  -6],
       [ -1,   0,  -1, ...,  -6,  -5,  -6]])
In [244]:
walks.max()
Out[244]:
113
In [245]:
walks.min()
Out[245]:
-111
In [246]:
hits30 = (np.abs(walks) >= 30).any(1)
hits30
Out[246]:
array([ True,  True, False, ...,  True, False, False], dtype=bool)
In [247]:
hits30.sum()
Out[247]:
3308
In [248]:
crossing_times = (np.abs(walks[hits30]) >= 30).argmax(1)
crossing_times.mean()
Out[248]:
503.45163240628779
In [249]:
steps = np.random.normal(loc=0, scale=0.25,
                         size=(nwalks, nsteps))
In [250]:
steps
Out[250]:
array([[ 0.0059, -0.2851, -0.1893, ..., -0.0766,  0.1219, -0.2726],
       [ 0.0941, -0.0111, -0.0559, ...,  0.3619, -0.1382,  0.1832],
       [-0.3799,  0.2321,  0.2399, ...,  0.2371, -0.1236,  0.5164],
       ..., 
       [ 0.1875, -0.1028, -0.2188, ...,  0.0617,  0.0766,  0.0678],
       [-0.4936,  0.5369, -0.0621, ..., -0.2358,  0.4583,  0.0522],
       [-0.029 ,  0.2028,  0.116 , ..., -0.1437, -0.155 , -0.1732]])