python大规模机器学习day3-使用数据库包sqlite3
使用数据库包sqlite3
实验要求:
1.学会使用SQLite3数据库
2.使用简单查询得到数据流
3.数据上传到数据库
4.流处理
实验内容:
1.流处理方式自动创建数据库
代码注释:
import os ,sys #os库用来处理路径信息,sys库用于帮助进行数据库的操作
import sqlite3, csv,glob #sqlite3用于数据库的操作,csv用于读写csv文件,glob用于模式匹配
SEP=’,’
def define_field(s):
try:
int(s)
return ‘integer’
except ValueError:
try:
float(s)
return ‘real’
except:
return’text’
def create_sqlite_db(db=‘database.sqlite’, file_pattern=’’): #参数中创建了一个数据库,用sqlite函数实现,此处用对象调用的方法。模式匹配参数用来匹配0或多个字符
conn = sqlite3.connect(db)**#connect用于打开非持久的数据库连接。此处函数创建一个数据库****
conn.text_factory = str # allows utf-8 data to be stored #定义该数据库的编码属性
c = conn.cursor() #cursor用来执行命令的方法,c作为一个对象,专门调用方法函数
# traverse the directory and process each .csv file useful for building the db
target_files = glob.glob(file_pattern) #glob模块和windows模块下的文件搜索差不多,此处没有加路径只有模式匹配参数,则默认路径为当前目录
print(‘Creating %i table(s) into %s from file(s): %s’ % (len(target_files), db, ‘,’.join(target_files)))
for k, csvfile in enumerate(target_files): #对目标文件进行迭代
# remove the path and extension and use what’s left as a tablename
tablename = os.path.splitext(os.path.basename(csvfile))[0] #splitext用于分开显示文件名和扩展名。basename用于返回文件名
with open(csvfile, “rt”)as f:
reader = csv.reader(f, delimiter=SEP)
f.seek(0) #0代表从文件开头开始算起
for n, row in enumerate(reader):
if n == 1:
types = map(define_field, row) #map会根据指定函数对序列作映射
else:
if n > 11:
break
f.seek(0)
for n,row in enumerate(reader):
if n==0:
sql = “DROP TABLE IF EXISTS %s” % tablename
c.execute(sql)
sql=“CREATE TABLE %s(%s)” %(tablename,",".join(["%s %s" % (col,ct) for col,ct in zip(row,types)])) #zip函数将列表打包为元组
print(’%i) %s’ %(k+1,sql))
c.execute(sql) #execute函数可以执行字符串内容,当作命令
#Creating indexs for faster joins on long strings
for column in row:
if column in row:
if column.endswitch(“ID_hash"): #endswitch判断字符串是否以指定字符或子字符串结尾。
index = "%s_%s” %(tablename,column)
sql=“CREATE INDEX %s on $s (%s)” %(index,tablename,column)
c.execute(sql)
insertsql = “INSERT INTO %s VALUES (%s)” % (tablename,",".join(["?" for column in row]))
rowlen = len(row)
else:
#raise an error if there are rows that don’t have the right number of fields
if len(row) ==rowlen:
c.execute(insertsql, row)
else:
print(‘Error at line %i in file %s’) %(n,csvfile)
raise ValueError(‘Houston, we’ve had a problem at row %i’ % n)
conn.commit() #commit用于提交当前数据库的事务
print (’* Insert %i rows’ % n)
c.close()
conn.close()
运行结果:
源代码:
import os ,sys
import sqlite3, csv,glob
SEP=’,’
def define_field(s):
try:
int(s)
return ‘integer’
except ValueError:
try:
float(s)
return ‘real’
except:
return’text’
def create_sqlite_db(db=‘database.sqlite’, file_pattern=’’):
conn = sqlite3.connect(db)
conn.text_factory = str # allows utf-8 data to be stored
c = conn.cursor()
# traverse the directory and process each .csv file useful for building the db
target_files = glob.glob(file_pattern)
print(‘Creating %i table(s) into %s from file(s): %s’ % (len(target_files), db, ‘,’.join(target_files)))
for k, csvfile in enumerate(target_files):
# remove the path and extension and use what’s left as a tablename
tablename = os.path.splitext(os.path.basename(csvfile))[0]
with open(csvfile, “rt”)as f:
reader = csv.reader(f, delimiter=SEP)
f.seek(0)
for n, row in enumerate(reader):
if n == 1:
types = map(define_field, row)
else:
if n > 11:
break
f.seek(0)
for n,row in enumerate(reader):
if n==0:
sql = “DROP TABLE IF EXISTS %s” % tablename
c.execute(sql)
sql=“CREATE TABLE %s(%s)” %(tablename,",".join(["%s %s" % (col,ct) for col,ct in zip(row,types)]))
print(’%i) %s’ %(k+1,sql))
c.execute(sql)
#Creating indexs for faster joins on long strings
for column in row:
if column.endswith(“ID_hash"):
index = "%s_%s” %(tablename,column)
sql=“CREATE INDEX %s on %s (%s)” %(index,tablename,column)
c.execute(sql)
insertsql = “INSERT INTO %s VALUES (%s)” % (tablename,",".join(["?" for column in row]))
rowlen = len(row)
else:
#raise an error if there are rows that don’t have the right number of fields
if len(row) ==rowlen:
c.execute(insertsql, row)
else:
print(‘Error at line %i in file %s’) %(n,csvfile)
raise ValueError(‘Houston, we’ve had a problem at row %i’ % n)
conn.commit()
print (’* Inserted %i rows’ % n)
c.close()
conn.close()
create_sqlite_db(db=‘bikesharing.sqlite’,file_pattern=‘bikesharing\*.csv’)
实验总结:
可以在python中使用sqlite包模拟数据库,完成创建数据库。