Bar plotting or hist plotting
In [74]: import numpy as np
import matplotlib.pyplot as plt
In [75]: data = [5., 25., 50, 20]
In [79]: plt.bar( range(len(data)), data )
plt.show()
In [80]: range(len(data))
Out[80]: range(0, 4)
In [81]: plt.barh( range(len(data)), data)
plt.show()
In [ ]:
In [82]: import matplotlib.pyplot as plt
In [83]: data = [5,25,50,20]
In [84]: plt.bar( range(len(data)), data, width=1)
plt.show()
In [87]: plt.barh( range(len(data)), data, height=1)
plt.show()
In [89]:
In [ ]:
17
In [90]: import numpy as np
import matplotlib.pyplot as plt
In [96]: data = [[5, 25, 50, 20], [4, 23, 51, 17], [6, 22, 52, 19]]
XList = np.arange(4)
In [97]: w = 0.25
# Each series is shifted by a multiple of the bar width (w*0, w*1, w*2) so the
# three bars of a group sit side by side: the first blue bar is centered on 0.0,
# the first green bar on 0.25 and the first red bar on 0.5.
for offset, (series, colour) in enumerate(zip(data, 'bgr')):
    plt.bar(XList + w * offset, series, color=colour, width=w)
plt.show()
In [ ]:
In [99]: import numpy as np
import matplotlib.pyplot as plt
In [100]: data = [[5,25,50,20], [4,23,51,17], [6,22,52,19]]
color_list=['b','g','r']
gap = .8/len(data)
In [101]: for i, row in enumerate(data): #the iterator enumerate returns both the current row and its i
X = np.arange(len(row))
plt.bar(X+i*gap, row, width=gap, color = color_list[ i%len(color_list) ])
#i%len(color_list) if len(data)>3
plt.show()
In [ ]:
In [102]: import matplotlib.pyplot as plt
In [106]: A = [5,30,45,22] B = [5,25,50,20]
XList= range(len(A))
In [107]: plt.bar(XList, A, color='b')
plt.bar(XList, B, color='r', bottom=A) #default width: 0.8
plt.show()
In [ ]:
In [108]: import numpy as np
import matplotlib.pyplot as plt
In [109]: A = np.array([5,30,45,22])
B = np.array([5,20,50,20])
C = np.array([1,2,1,1])
xList=np.arange(4)
In [110]: plt.bar(xList, A, color='b')
plt.bar(xList, B, color='y', bottom=A)
plt.bar(xList, C, color='r', bottom=A+B)
plt.show()
20
In [ ]:
In [111]: import numpy as np
import matplotlib.pyplot as plt
In [112]: data=np.array([ [5,30,45,22], [5,20,50,20], [1, 2, 1, 1] ])
colorList = ['b', 'y','r']
xList = np.arange(data.shape[1])  # data.shape[1] is the number of columns
for i in range(data.shape[0]):  # one stacked layer per row of data
    # Sum the rows already drawn column by column (axis=0) to get this layer's
    # baseline; for i == 0 the slice is empty and the baseline is all zeros.
    baseline = np.sum(data[:i], axis=0)
    # Use the list defined in this cell (colorList), not a name left over from an earlier cell.
    plt.bar(xList, data[i], bottom=baseline, color=colorList[i % len(colorList)])
plt.show()
21
In [ ]:
In [113]: import numpy as np
import matplotlib.pyplot as plt
In [115]: women_pop = np.array([5,30,45,22])
men_pop = np.array([5,25,50,20])
xList= np.arange(4)
In [116]: plt.barh(xList, women_pop, color='r')
plt.barh(xList, -men_pop, color='b')
plt.show()
In [ ]:
In [119]: import numpy as np
import matplotlib.pyplot as plt
In [120]: xList = np.random.randn(1000)  # 1000 samples; no seed is set, so every run draws new values
In [121]: plt.hist(xList, bins=20)  # the same sample split into 20 bins
plt.show()
In [122]: plt.hist(xList, bins=50)  # the same sample split into 50 bins
plt.show()
In [ ]:
3 Using custom colors for bar charts
In [13]: import numpy as np
import matplotlib.pyplot as plt
In [14]: women_pop = np.array([5.0, 30.,45., 22.])
men_pop = np.array([5.0, 25., 50., 20.])
In [15]: X=np.arange(4) #0~3
In [18]: plt.barh(X, women_pop, color='0.25')
plt.barh(X, -men_pop, color='0.75')# The parameter edgecolor is alsoavailable
plt.show().
Out[18]:
In [19]: import numpy as np
import matplotlib.pyplot as plt
In [21]: values = np.random.randint(99, size=50) #generating 50 integers with 0<=values<=98 (the upper bound 99 is exclusive)
In [22]: values
Out[22]: array([ 8, 21, 30, 9, 74, 0, 91, 97, 81, 80, 21, 47, 18, 3, 81, 53, 22, 84, 50, 2, 33, 82, 93, 89, 51, 71, 87, 48, 0, 57, 15, 38, 66, 48, 75, 98, 46, 35, 33, 20, 28, 30, 20, 80, 83, 68, 29, 13, 38, 61])
In [25]: color_set = ('.00','.25','.50','.75')
# In Python "/" is floating-point division while "//" is integer (floor) division,
# so the expression below turns a value below 100 into an index into color_set.
n_shades = len(color_set)
color_list = [color_set[(n_shades * val) // 100] for val in values]
plt.bar(np.arange(len(values)), values, color=color_list)
plt.show()
In [26]: (len(color_set) * 8) //100
Out[26]: 0
In [28]: color_set = ('.00','.25','.50','.75')
# "/" is floating-point division, "//" is integer (floor) division.
# Sort once and use the same sorted sequence for both the bar heights and the
# colors; otherwise bar i would be painted with the shade of a different value.
sorted_values = np.sort(values)
color_list = [color_set[(len(color_set) * val) // 100] for val in sorted_values]
plt.bar(np.arange(len(sorted_values)), sorted_values, color=color_list)
plt.show()
7 Using colormaps for bar charts
In [11]: import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as col
import matplotlib.pyplot as plt
In [13]: values = np.random.randint(99, size=50) # 50 integers in 0~98 (the upper bound 99 is exclusive)
In [14]: #normalize data into the [0.0, 1.0] interval: Normalize(0, 99) declares 0 and 99 as the ends of the data range
cmap = cm.ScalarMappable(col.Normalize(0,99), cm.binary)
In [16]: #converts the array of values to an array of RGBA colors, one per bar
plt.bar(np.arange(len(values)), values, color = cmap.to_rgba(values))
plt.show()
# # we use the linestyle parameter of pyplot.plot() to control the line
# pattern of three different curves. The following line styles are available:
# ### Solid
# ### Dashed
# ### Dotted
# ### Dashdot
The line style with other plot types
# In[24]:
import numpy as np
import matplotlib.pyplot as plt
# In[25]:
N = 8
A, B = np.random.random(N), np.random.random(N)
X = np.arange(N)
# In[60]:
# Solid gray bars for A, then white bars with a dashed yellow outline stacked on top.
plt.bar(X, A, color='0.75')
dashed_outline = dict(color='w', linestyle='dashed', linewidth=1, edgecolor='y')
b = plt.bar(X, A + B, bottom=A, **dashed_outline)
plt.show()
Controlling a fill pattern
hatch pattern
# /
# \
# |
# -
# +
# x
# o
# O
# .
# *
edgecolor parameter will control the color of the hatching.
# In[70]:
import numpy as np
import matplotlib.pyplot as plt
# In[72]:
N = 8
A = np.random.random(N)
B = np.random.random(N)
X = np.arange(N)
# In[77]:
# The edgecolor parameter also controls the color of the hatching.
plt.bar(X, A, color='w', hatch='x', edgecolor='k')
# Single-letter color codes must be lowercase: 'W' is rejected by current matplotlib.
plt.bar(X, A+B, bottom=A, color='w', hatch='/', edgecolor='k')
plt.show()
Bar Plots¶
The plot.bar() and plot.barh() make vertical and horizontal bar plots, respectively
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Two stacked panels (2 rows x 1 column): vertical bars on top, horizontal bars below.
fig, axes = plt.subplots(nrows=2, ncols=1)
index_labels = list('abcdefghijklmnop')
data = pd.Series(np.random.rand(16), index=index_labels)  # rand() draws from [0, 1)
top_panel, bottom_panel = axes[0], axes[1]
data.plot.bar(ax=top_panel, color='k', alpha=0.7, rot=0)
data.plot.barh(ax=bottom_panel, color='b', alpha=0.7)
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# 6 x 4 frame of random numbers; the column index is named 'Genus'.
row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
genus_columns = pd.Index(['A', 'B', 'C', 'D'], name='Genus')
df = pd.DataFrame(np.random.rand(6, 4), index=row_labels, columns=genus_columns)
df
# stacked=True puts the values of one row into a single bar instead of side by side.
df.plot.barh(stacked=True, alpha=0.5, rot=0)
plt.legend(loc='upper right', title='Genus')
plt.show()
A useful recipe for bar plots is to visualize a Series’s value frequency using value_counts: s.value_counts().plot.bar().
#######################################################
tips.csv
#######################################################
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
tips = pd.read_csv('../examples/tips.csv')
tips.head()
# Cross-tabulate: 'day' supplies the rows (axis 0), 'size' the columns (axis 1).
day_col, size_col = tips['day'], tips['size']
party_counts = pd.crosstab(day_col, size_col)
party_counts
party_counts = party_counts.loc[:, 2:5]  # keep the columns labelled 2, 3, 4 and 5
party_counts
# Normalize so that every row sums to 1: divide each row by its own total.
row_totals = party_counts.sum(1)
party_pcts = party_counts.div(row_totals, axis=0)
# Grouped by day (axis=0), e.g. 16/(16+1+1+0) = 0.888889
party_pcts
party_pcts.plot.bar(rot=90)
plt.show()
Conclusion:
So you can see that party sizes appear to increase on the weekend in this dataset.
seaborn
import seaborn as sns
import matplotlib.pyplot as plt
tips = pd.read_csv('../examples/tips.csv')
tips.head()
# Tip as a fraction of the bill without the tip, e.g. 0.063204 = 1.01 / (16.99 - 1.01)
tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])
tips.head()
# Horizontal bar plot of tip_pct, one bar per value of 'day'.
sns.barplot(data=tips, x='tip_pct', y='day', orient='h')
# NOTE(review): sns.set is called after the plot has been drawn - presumably it only affects later figures; confirm.
sns.set(style=None)
plt.show() #The black lines drawn on the bars represent the 95% confidence interval
# Same plot, additionally split by the 'time' column through hue.
sns.barplot(data=tips, x='tip_pct', y='day', orient='h', hue='time')
sns.set(style='whitegrid')
plt.legend(loc='center right', title='time')
plt.show()
# Print the built-in documentation of the styling helpers.
help(sns.set)
help(sns.axes_style)
Histograms and Density Plots¶
A histogram is a kind of bar plot that gives a discretized display of value frequency. The data points are split into discrete, evenly spaced bins, and the number of data points in each bin is plotted.
import seaborn as sns
import matplotlib.pyplot as plt
tips = pd.read_csv('../examples/tips.csv')
tips.head()
tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])
tips.head()
# The tip_pct values are split into 50 bins by value and the count per bin is drawn.
tip_pct = tips['tip_pct']
tip_pct.plot.hist(bins=50)
plt.title('Histogram of tip percentages')
plt.show()
A related plot type is a density plot, which is formed by computing an estimate of a
continuous probability distribution that might have generated the observed data.
density plots are also known as kernel density estimate (KDE) plots.
Using plot.kde makes a density plot using the conventional mixture-of-normals estimate
# Density (KDE) plot of the tip_pct column computed earlier.
tips['tip_pct'].plot.density()
plt.title('Density plot of tip percentages')
plt.show()
高斯分布(Gaussian Distribution)的概率密度函数(probability density function):
np.random.randn(size)
所谓标准正态分布(μ=0,σ=1),对应于np.random.normal(loc=0, scale=1, size)
#normal distribution mu=0, sigma=1=std.dev
Seaborn makes histograms and density plots even easier through its distplot
method, which can plot both a histogram and a continuous density estimate simultaneously.
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# First component: mu=0, sigma=1 (standard deviation), 200 samples
comp1 = np.random.normal(0,1,size=200)
# Second component: mu=10, sigma=2 (standard deviation), 200 samples
comp2 = np.random.normal(10,2, size=200)
# Join both samples into one Series of 400 values (a bimodal mixture).
values= pd.Series(np.concatenate([comp1, comp2]))
# NOTE(review): distplot is reported as deprecated in newer seaborn releases (histplot/displot replace it) - confirm against the installed version.
sns.distplot(values, bins=100, color='k')
plt.title('Normalized histogram of normal mixture with density estimate')
plt.show()
Figures and Subplots
Plots in matplotlib reside within a Figure object.
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
# Each line's label feeds the legend; pass no label or label='_nolegend_' to leave a line out.
# np.random.randn is spelled out because the bare name `randn` is only imported in a later cell.
ax.plot(np.random.randn(1000).cumsum(), color='k', label='one')
ax.plot(np.random.randn(1000).cumsum(), color='k', linestyle='--', label='two')
ax.plot(np.random.randn(1000).cumsum(), color='k', linestyle='dotted',label='three')
# Put ticks at fixed x positions and replace the numbers with text labels.
ticks = ax.set_xticks([0,250,500,750,1000])
labels = ax.set_xticklabels(['one','two', 'three', 'four', 'five'], rotation=30, fontsize='small')
ax.set_title('My first matplotlib plot')
ax.set_xlabel('Stages')
# ---- Equivalent alternative: set several axes properties in one call ----
props = {
    'title': 'My first matplotlib plot',
    'xlabel': 'Stages'
}
ax.set(**props)
# --------------------------------------------------------------------------
ax.legend(loc='best')
plt.show()
matplotlib draws on the last figure and subplot used (creating one if necessary), thus hiding the figure and subplot creation.
plt.plot(np.random.randn(50).cumsum(), color='black', ls='--')
matplotlib includes a convenience method, plt.subplots, that creates a new figure and returns a NumPy array containing the created subplot objects. The axes array can be easily indexed like a two-dimensional array; for example, axes[0, 1].
Adjusting the spacing around subplots
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=None)
# 1
# 2 x 2 grid of histograms sharing both axes.
fig, axes = plt.subplots(2,2, sharex=True, sharey= True)
for i in range(2):
    for j in range(2):
        # every panel gets its own fresh sample of 500 values
        axes[i,j].hist(np.random.randn(500), bins =5, color='k', alpha=0.5)
# Shrink the horizontal and vertical spacing between the panels.
plt.subplots_adjust(wspace=0.05, hspace=0.05)
# 2
from numpy.random import randn
# Running sum of 30 random values, drawn as a dashed step line with round markers.
arr = randn(30)
arrCumSum = np.cumsum(arr)
step_style = dict(color='k', linestyle='dashed', drawstyle='steps-post', marker='o')
plt.plot(arrCumSum, label='steps-post', **step_style)
plt.legend(loc='best')  # the legend entry shows the label passed to plot()
plt.show()
Annotations and Drawing on a Subplot
import numpy as np
import pandas as pd
from datetime import datetime
#index_col : int or sequence or False, default None
# The first CSV column becomes the index and is parsed as dates.
data = pd.read_csv('../examples/spx.csv',parse_dates=True, index_col=0)
spx = data['SPX'] #'SPX' column
# (date, text) pairs, one per annotation drawn on the chart
crisis_data=[
    (datetime(2007, 10, 11), 'Peak of bull market'), #tuple
    (datetime(2008, 3, 12), 'Bear Stearns Fails'),
    (datetime(2008, 9, 15), 'Lehman Bankruptcy')
]
# matplotlib configuration
plt.rc('figure', figsize=(10,10))
font_options={
    'family': 'monospace',
    'weight': 'bold',
    'size': 16
}
plt.rc('font', **font_options)
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
spx.plot(ax=ax, color='green', linestyle='-')
for date, label in crisis_data:
    price = spx.asof(date)  # looked up once, used for both the text and the arrow position
    ax.annotate(
        label,
        ha='left',
        va='top',
        xytext=(date, price + 225),  # The xytext parameter specifies the text position.
        xy=(date, price + 75),  # The xy parameter specifies the arrow's destination
        arrowprops=dict(facecolor='blue', headwidth=10, headlength=4, width=2),
        #arrowprops={'facecolor':'blue', 'headwidth':10, 'headlength':4, 'width':2}
    )
#Zoom in on 2007-2010
ax.set_xlim(['1/1/2007', '1/1/2011'])
ax.set_ylim([600,1800])
ax.set_title('Important dates in the 2008-2009 financial crisis')
plt.show()
Adding arrows
The aspect of the arrow is controlled by a dictionary passed to the arrowprops parameter. 'arrowstyle': the values '<-', '<', '-', 'wedge', 'simple', and 'fancy' control the style of the arrow. 'facecolor': the color used for the arrow; it is used to set the background and the edge color. 'edgecolor': the color used for the edges of the arrow's shape. 'alpha': the transparency level, so that the arrow blends with the background.
The shrink parameter controls the gap between the arrow's endpoints and the arrow itself.
Facet Grids分面网格 and Categorical Data类型数据
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
tips = pd.read_csv('../examples/tips.csv')
tips.head()
# Tip as a fraction of the bill without the tip.
tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])
tips.head()
# sns.factorplot was renamed to sns.catplot and is no longer available in current
# seaborn releases; the keyword arguments used here are unchanged.
#categorical data: one panel per 'smoker' value, bars colored by 'time'
sns.catplot(x='day', y='tip_pct', hue='time', col='smoker', kind='bar', data=tips[tips.tip_pct <1])
plt.show()
#categorical data: a grid with one row per 'time' and one column per 'smoker'
sns.catplot(x='day', y='tip_pct', row='time', col='smoker', kind='bar', data=tips[tips.tip_pct <1])
plt.show()
# Box plot of tip_pct per day, restricted to rows with tip_pct below 0.5.
sns.catplot(x='tip_pct', y='day', kind='box', data=tips[tips.tip_pct<0.5])
plt.show()