SparkSQL之DataFrame基本API操作
废话不多说直接上代码和图解:
import org.apache.spark.sql.SparkSession
/**
 * Basic DataFrame API operations.
 *
 * Created by Li Guohui on 2018/10/9.
 */
object DataFrameApp {
  def main(args: Array[String]): Unit = {
    // Build (or reuse) a local SparkSession with 2 worker threads.
    val spark = SparkSession.builder().appName("DataFrameApp").master("local[2]").getOrCreate()

    // Load a JSON file as a DataFrame; the schema is inferred from the data.
    val people = spark.read.format("json").load("people.json")

    // Print the DataFrame's schema as a tree.
    // (Result screenshot shown in the original post.)
    people.printSchema()

    // Show the first 20 rows of the dataset.
    // (Result screenshot shown in the original post.)
    people.show()

    // Select the "name" column — equivalent to: SELECT name FROM table
    people.select("name").show()

    // Add 10 to the "age" column — equivalent to: SELECT name, age + 10 FROM table
    people.select(people.col("name"), people.col("age") + 10).show()

    // .as() gives the derived column an alias.
    people.select(people.col("name"), (people.col("age") + 10).as("age2")).show()

    // Filter rows by a column value — equivalent to: SELECT * FROM table WHERE age > 24
    people.filter(people.col("age") > 24).show()

    // Group by a column, then aggregate — equivalent to:
    // SELECT age, count(1) FROM table GROUP BY age
    people.groupBy("age").count().show()

    // Release the SparkSession's resources.
    spark.stop()
  }
}
以上就是常用的 DataFrame API 基本操作。