%spark

//If you created the smallest BDC environment (2 OCPU) and have just finished the previous lab, this step will take about 6 minutes.
//This is because the previous lab started a hive/tez session, which stays open for a few minutes and holds resources that spark needs.
//Once hive/tez closes its session, this spark code will run.  You can go to the Jobs tab and abort the hive session manually, or just relax and wait.



//a previous tutorial placed the csv file into your Object Store citibike container
//notice the use of the swift://CONTAINER.default/ syntax
val Container = "bdcsce"
val Directory = "citibike"

//build the object store location once from Container and Directory, so the values above are the only ones to edit
val objectStoreUrl = s"swift://$Container.default"
val tripDataPath = s"$objectStoreUrl/$Directory/raw/201612-citibike-tripdata.csv"

//use 4 partitions for spark sql shuffles in this session
sqlContext.setConf("spark.sql.shuffle.partitions", "4")

//a swift:// object store URL is reported as OCI-C, anything else as OCI
val runningOnOciClassic = objectStoreUrl.startsWith("swift://")
println(if (runningOnOciClassic) "Running on OCI-C" else "Running on OCI")

//read the csv straight from the object store
//header=true: the first row holds the column names
//inferSchema=true: spark makes an extra pass over the data to work out column types (otherwise every column is a string)
//NOTE(review): df is kept as a var in case a later paragraph reassigns it - switch to val if none does
var df: org.apache.spark.sql.DataFrame = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load(tripDataPath)


// If you get this error message:
// java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
// Then go to the Settings tab, then click on Notebook.  Then restart the Notebook.  This will restart your SparkContext


//show the column names and types of the loaded DataFrame
println("Here is the schema detected from the CSV")
df.printSchema()
println("..")

//count() is a spark action, so this line triggers a job over the data
println("# of rows: %s".format(df.count()))
println("..")

//make the DataFrame queryable from SQL as bike_trips_temp (an existing view with that name is replaced)
df.createOrReplaceTempView("bike_trips_temp")
println("done")