Spark DataFrame vs Dataset
DataFrame vs Dataset
DataFrame = Dataset[Row]
SchemaRDD ---------->DataFrame ---------->Dataset
rename due to compile-time type safety
OO structure change
compile-time type safety:在写代码(编译)的时候就把问题暴露出来。DataFrame 和 Dataset 具有类似的方法,Dataset 可由 DataFrame 转换而来。
import org.apache.spark.sql.SparkSession

/**
 * Demo: convert an untyped DataFrame into a typed Dataset[CsvFile] and compare
 * the optimized query plans of an untyped column selection (`select("id")`)
 * with a typed transformation (`map(_.id)`).
 *
 * Created by Administrator on 2018/6/8.
 */
object DatasetApp {

  /** CSV location used when no path is given on the command line. */
  private val DefaultCsvPath = "file:///D:/Data/123.csv"

  /**
   * Loads a CSV file as a DataFrame, converts it to a Dataset[CsvFile], prints
   * the optimized plans of an untyped and a typed projection of `id`, and
   * prints the Dataset schema.
   *
   * @param args optional; `args(0)` overrides the default CSV path
   *             (for example `file:///disk4/data/123.csv`)
   */
  def main(args: Array[String]): Unit = {
    val csvPath = args.headOption.getOrElse(DefaultCsvPath)

    val spark = SparkSession.builder()
      .appName("DatasetApp")
      .master("local[2]")
      .getOrCreate()

    // Stop the session even if reading or converting the data fails.
    try {
      // Brings the implicit encoders needed by `as[CsvFile]` and `map` into scope.
      import spark.implicits._

      // Create the DataFrame; the first line is a header and column types are inferred.
      val csvDF = spark.read
        .format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvPath)

      // Convert the DataFrame to a typed Dataset.
      val ds = csvDF.as[CsvFile]

      // Benefit of the typed API: `x.id` is checked by the compiler, while the
      // string column name in `select("id")` is only resolved at run time.
      val selectedDF = csvDF.select("id")
      val selectedDS = ds.map(x => x.id)

      // Show the optimized plans. `numberedTreeString` only returns a String,
      // so it has to be printed explicitly to be visible.
      println(selectedDF.queryExecution.optimizedPlan.numberedTreeString)
      println(selectedDS.queryExecution.optimizedPlan.numberedTreeString)

      ds.printSchema()
    } finally {
      spark.stop()
    }
  }

  /** Row type of the CSV file; field names must match the CSV header columns. */
  case class CsvFile(id: Int, name: String, age: Int)
}