- #lang racket/base
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Introduction to hive internals.
(require slideshow/base
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Overall settings
;; A pict to use behind the main content
;; fade-bg is bound to the result of `inset`-ing a `dc`-drawn pict by
;; (- margin).
;; NOTE(review): as written this definition is not a complete expression:
;; `draw-one` is called but no binding for it is visible (the
;; `(lambda (i) ...)` below looks like its intended value), and the
;; `dc margin 1 ... #t #t)` lines look like the arguments of a call whose
;; head is not present.  Confirm against the author's copy before relying
;; on it.
(define fade-bg
;; w, h: the client size plus one margin on each side
(let ((w (+ (* 2 margin) client-w))
(h (+ (* 2 margin) client-h))
;; two white brushes: a transparent one and a solid one
(trans (make-object brush% "white" 'transparent))
(inside (make-object brush% "white" 'solid)))
(inset (dc (lambda (dc x y)
;; b, p: the drawing context's brush and pen on entry; both are set
;; back at the end of the drawing procedure
(let ((b (send dc get-brush))
(p (send dc get-pen))
;; draws a rectangle shrunk by i on every side of the w-by-h area at (x, y)
(lambda (i)
(send dc draw-rectangle
(+ x i) (+ y i)
(- w (* 2 i)) (- h (* 2 i))))))
(send dc set-brush trans)
dc margin 1
(make-object color% "black")
(make-object color% "white")
#t #t)
(send dc set-brush inside)
(draw-one margin)
;; restore the pen and brush saved above
(send dc set-pen p)
(send dc set-brush b)))
w h 0 0)
(- margin))))
;; a new assembler
(lambda (title v-sep content)
(if title
(vc-append v-sep
;; left-aligns the title:
(titlet title)
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Utility functions & constants
;; item used in outline
;; Builds one outline entry: an `item` whose width is three quarters of
;; the current paragraph width.  The rest-argument list is handed to
;; `item` unchanged as its content.
(define outline-item
  (lambda contents
    (let ((entry-width (* 3/4 (current-para-width))))
      (item #:width entry-width contents))))
;; title string with a sub title string
;; Joins a title string and a subtitle string as "title: subtitle".
(define title/sub
  (lambda (title subtitle)
    (let ((separator ": "))
      (string-append title separator subtitle))))
;; titles used in this talk
;; Each group below binds one section title followed by the titles of the
;; slides in that section.

;; overview section
(define-values (overview overview-arch overview-flow)
  (values "Big Pictures"
          "Overall Architecture"
          "Overall Control Flow"))

;; compiler section
(define-values (compiler
                compiler-overview
                compiler-parser
                compiler-semantic
                compiler-optimizer
                compiler-task/plan)
  (values "Compiler Internals"
          "Overview"
          "Parser"
          "Semantic Analyzer"
          "Optimizer"
          "Physical Plan Gen"))

;; runtime section
(define-values (runtime
                runtime-overview
                runtime-task
                runtime-operator
                runtime-expression)
  (values "Runtime Internals"
          "Overview"
          "Task"
          "Operator"
          "Expression"))

;; single-title sections
(define-values (object-model ideal-diff references)
  (values "Object Model"
          "Distance to Expection"
          "References"))
;; outline of this talk
;; The body is a flat run of `'tag title` pairs; the first three are each
;; followed by a `(lambda (tag) ...)` that lists that section's sub-entries
;; with outline-item.
;; NOTE(review): the call these are arguments to is not visible here --
;; presumably slideshow's `make-outline`, since `outline` is later applied
;; to a tag symbol; TODO confirm.  As written, the parentheses close the
;; `define` at the end of the first lambda, so the remaining lines fall
;; outside it.  The last three pairs also have no third element, unlike
;; the first three.
(define outline
;; overview section and its sub-entries
'overview overview
(lambda (tag)
(outline-item "Background")
(outline-item overview-arch)
(outline-item overview-flow)))
;; compiler section and its sub-entries
'compiler compiler
(lambda (tag)
(outline-item compiler-overview)
(outline-item compiler-parser)
(outline-item compiler-semantic)
(outline-item compiler-optimizer)
(outline-item compiler-task/plan)))
;; runtime section and its sub-entries
'runtime runtime
(lambda (tag)
(outline-item runtime-overview)
(outline-item runtime-task)
(outline-item runtime-operator)
(outline-item runtime-expression)))
;; sections listed without sub-entries
'object-model object-model
'ideal-diff ideal-diff
'references references
;; Size passed to the large `text` lines of the deck: one and a half
;; times the value of (current-font-size) when this definition is
;; evaluated.
(define title-font-size
  (let ((base-size (current-font-size)))
    (* 3/2 base-size)))
;; The current date rendered as a string, computed once when this
;; definition is evaluated.
(define date-str
  (let ((today (current-date)))
    (date->string today)))
;; java code
;; Renders str with `it` and colors the result "darkblue"; used for
;; Java identifiers and code fragments on the slides.
(define (jcode str)
  (let ((italic-text (it str)))
    (colorize italic-text "darkblue")))
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Slides begin here
;; Title slide: talk name, author contact, date, and a small credit line.
;; The `#:name` keyword and the final closing parenthesis belong to a
;; `slide` call, so the form is opened with `(slide`.
(slide
 #:name "Begin"
 (text "Hive Internals" (current-main-font) title-font-size)
 (t "liangkun@baidu.com")
 (t date-str)
 ;; `text` takes the font style before the size; the main font is used
 ;; here, as in the headline, so the size lands in the size position.
 ;; NOTE(review): confirm the main font is the intended style.
 (text "Powered by Scheme"
       (current-main-font)
       (* 1/3 title-font-size)))
;; ---- Overview section ----
;; Each slide below is a `slide` call: the keyword argument at the top and
;; the extra closing parenthesis at the bottom of every group require the
;; `(slide` opener.

;; outline slide for the overview section
(outline 'overview)

;; what Hive is and what it does
(slide
 #:title "Background"
 (item "What is Hive")
 (subitem "data warehouse facilities on hadoop")
 (subitem "SQL-like query language & engine")
 (item "What does Hive do")
 (subitem "enable easy ETL")
 (subitem "impose structure on data")
 (subitem "access files on hadoop or other storage(HBase)")
 (subitem "query execution via MapReduce")
 (bit "Hive is heading to a DDBMS!"))

;; architecture diagram, loaded from a bitmap file and scaled to 80%
(slide
 #:title overview-arch
 (scale (bitmap "hive-arch.bmp") 0.80))

;; source-tree directories, each shown with its name as the bullet
(slide
 #:title (title/sub overview-arch "Source Code")
 (item #:bullet (bt "cli:") "command line interface")
 (item #:bullet (bt "hwi:") "web interface")
 (item #:bullet (bt "service:") "thrift server interface")
 (item #:bullet (bt "odbc/jdbc:") "database connection interface")
 (item #:bullet (bt "serde:") "object module & (de)serialization")
 (item #:bullet (bt "ql:") "compiler & runtime")
 (item #:bullet (bt "metastore:") "metastore layer")
 (item #:bullet (bt "contrib:") "third udf/serde libraries"))

;; control flow: initialization, then the per-command processing loop
(slide
 #:title overview-flow
 (item "Initialization")
 (subitem "command line and config file parsing")
 (subitem "init logging system")
 (subitem "start a new session through" (jcode "SessionState"))
 (item "Process Loop(for-each HQL command):")
 (subitem "get a" (jcode "CommandProcessor") "instance from"
          (jcode "CommandProcessorFactory"))
 (subitem "run the processor")
 (subitem "get and display results"))
;; CommandProcessor slide: an introductory paragraph and a three-column
;; table (header row, then one row per command kind).
;; NOTE(review): the opening `(slide` is not present, and the `table` call
;; stops after one separator argument and is never closed.  pict's `table`
;; also expects column/row alignment arguments and a second separator;
;; those cannot be determined from this file -- confirm with the author.
#:title (title/sub overview-flow "CommandProcessor")
(para "All Processors from" (jcode "CommandProcessorFactory"))
(table 3
;; cells in row-major order: command, processor class, description
(list (bt "Command") (bt "Processor") (bt "Describe")
(t "SET ...") (jcode "SetProcessor") (t "property setting")
(t "DFS ...") (jcode "DfsProcessor") (t "hadoop dfs command")
(t "ADD ...") (jcode "AddProcessor") (t "add resources")
(t "DELETE ...") (jcode "DeleteProcessor") (t "delete resources")
(t "other") (jcode "Driver") (t "HQL processing"))
(* 3/2 gap-size)
;; Each slide below is a `slide` call: the keyword argument at the top and
;; the extra closing parenthesis at the bottom of every group require the
;; `(slide` opener.

;; control-flow diagram, loaded from a bitmap file and scaled to 55%
(slide
 #:title overview-flow
 (scale (bitmap "hive-flow.bmp") 0.55))

;; ---- Compiler section ----
;; outline slide for the compiler section
(outline 'compiler)

;; compiler layers, in processing order
(slide
 #:title (title/sub compiler-overview "Layers")
 (item "Variable replacement (in driver)")
 (item "Case Conversion")
 (item "Parser: HQL to AST")
 (item "Logical Plan Gen: AST to QueryBlock")
 (item "Optimization: Logical Plan to Logical Plan")
 (item "Physical Plan Gen: Logical Plan to QueryPlan"))

;; intermediate languages used between the layers
(slide
 #:title (title/sub compiler-overview "ILs")
 (item "AST" (jcode "ASTNode"))
 (item "Query Block Tree" (jcode "QB"))
 (subitem "each node is a query")
 (subitem "subquery as child-node")
 (subitem "contains lookaside tables(metainfo)")
 (subitem "contains operator DAG")
 (item "Physical Plan" (jcode "QueryPlan")))

;; parser: generator, grammar file, and imaginary tokens
(slide
 #:title compiler-parser
 (item "Front End")
 (item "Antlr Parser Generator")
 (subitem "syntax definition: ql/parse/Hive.g")
 (subitem "LL(3) without backtracking")
 (subitem "actions are constructing tree")
 (item "Imaginary Tokens")
 (subitem "begin with:" (jcode "TOK_"))
 (subitem "logical node")
 (subitem (jcode "^(TOK_SELECT hintClause? SelectList)")))
;; Semantic analyzer slide: driver role, base class, and a two-column
;; table of analyzer classes.
;; NOTE(review): the opening `(slide` is not present, and the `table` call
;; stops after one separator argument and is never closed.  pict's `table`
;; also expects column/row alignment arguments and a second separator;
;; those cannot be determined from this file, so the structure is left as
;; found -- confirm with the author.
#:title compiler-semantic
(item "Mid-End & Back-End Driver")
(item "Base Class:" (jcode "BaseSemanticAnalyzer"))
;; class names use Hive's spelling: SemanticAnalyzer, SemanticAnalyzerFactory
(item "All Analyzers from" (jcode "SemanticAnalyzerFactory"))
(table 2
       ;; cells in row-major order: analyzer class, description
       (list (bt "SemanticAnalyzers") (bt "Describe")
             (jcode "Explain...") (t "EXPLAIN statements")
             (jcode "Load...") (t "loading data into a table")
             (jcode "DDL...") (t "metadata manipulation")
             (jcode "Function...") (t "create/drop udf")
             (jcode "SemanticAnalyzer") (t "other queries"))
       (* 3/2 gap-size)
;; Each slide below is a `slide` call: the keyword argument at the top and
;; the extra closing parenthesis at the bottom of every group require the
;; `(slide` opener.

;; logical plan generation, part 1: phase-1 traversal and metadata lookup
(slide
 #:title (title/sub compiler-semantic "Logical Plan Gen 1")
 (item (jcode "doPhase1()") "recursively traverse AST")
 (subitem "construct lookaside tables: aliasToTable, aliasToSubq,"
          "clauseToDest, ...")
 (subitem "put lookaside tables into" (jcode "QB/QBParseInfo"))
 (subitem "check for semantic errors")
 (item (jcode "getMetaData()"))
 (subitem "for source tables, destination tables")
 (subitem "put metadata into" (jcode "QB/QBParseInfo")))

;; logical plan generation, part 2: operator DAG construction
(slide
 #:title (title/sub compiler-semantic "Logical Plan Gen 2")
 (item (jcode "genPlan()"))
 (subitem "recursively called on sub-queries")
 (subitem "gen" (jcode "TableScanOperator") "for source table")
 (subitem "gen DAG for lateral view")
 (subitem "gen JoinPlan if there is a join token")
 (subitem (jcode "genBodyPlan()") "for other clause"))

;; optimizer: the pass interface and the optimizer that drives passes
(slide
 #:title (title/sub compiler-optimizer "Basic Form")
 (item "Pass")
 (subitem (jcode "Transform") "interface")
 (subitem (jcode "ParseContext transform(ParseContext pctx)"))
 (item (jcode "Optimizer"))
 (subitem "organizer: register a sequence of passes")
 (subitem "driver: apply registered passes in order")
 (subitem "context:" (jcode "ParseContext") "object"))

;; the pattern shared by passes, motivating a traversal framework
(slide
 #:title (title/sub compiler-optimizer "Improvement Opportunity")
 (item "Common Operation Pattern")
 (subitem "traverses IL in some order")
 (subitem "apply some action(s) according to some rules"
          "when visiting a node in traversal")
 (item "Extract Common Pattern")
 (subitem "traversal framework"))

;; the pieces of the traversal framework
(slide
 #:title (title/sub compiler-optimizer "Traversal Framework")
 (item (jcode "Node"))
 (subitem "how to traverse")
 (item (jcode "GraphWalker"))
 (subitem "traversal order")
 (item (jcode "Dispatcher"))
 (subitem "find best(lowest cost) rule, apply its action")
 (item (jcode "Rule") "&" (jcode "NodeProcessor"))
 (subitem "rules and actions"))

;; optimizations listed by class name
(slide
 #:title (title/sub compiler-optimizer "Current Optimizations")
 (item (jcode "ColumnPruner"))
 (item (jcode "PartitionPruner"))
 (item (jcode "SamplePruner"))
 (item (jcode "PredicatePushDown"))
 (item (jcode "GroupByOptimizer"))
 (item (jcode "MapJoinProcessor"))
 (item (jcode "JoinReorder"))
 (item "..."))

;; physical plan generation
(slide
 #:title compiler-task/plan
 (item (jcode "genMapRedWorks()"))
 (subitem "take" (jcode "QB") "as input")
 (subitem "gen task DAG")
 (subitem "also uses optimization framework"))
;; ---- Runtime section ----
;; outline slide for the runtime section
(outline 'runtime)

;; Runtime entities slide: tasks, operators, expressions.
;; NOTE(review): the opening `(slide` is not present and the last
;; `subitem` is never closed; whatever followed "ConstantEvaluator" is not
;; in this file, so the structure is left as found -- confirm with the
;; author.
#:title (title/sub runtime-overview "Entities")
(item "Tasks")
(subitem "may consist of operators DAG")
(subitem (jcode "DDLTask") (jcode "MapRedTask"))
(item "Operators")
(subitem "may contain expressions")
(subitem (jcode "FilterOperator") (jcode "SelectOperator"))
(item "Expressions")
(subitem "expression tree for specific computation")
(subitem "ConstantEvaluator"
;; Task slide: exec/plan class pairing, base class, factory, and a
;; two-column table of task classes.
;; NOTE(review): the opening `(slide` is not present, and the `table` call
;; stops after one separator argument and is never closed.  pict's `table`
;; also expects column/row alignment arguments and a second separator;
;; those cannot be determined from this file -- confirm with the author.
#:title runtime-task
(item (jcode "exec/XXXTask") "<-->" (jcode "plan/XXXWork"))
(item "Base Class:" (jcode "Task"))
(item "All Tasks from" (jcode "TaskFactory"))
(table 2
;; cells in row-major order: task class, description
(list (bt "Task") (bt "Describe")
(jcode "MapRedTask") (t "map or reduce job")
(jcode "StatsTask") (t "analyze a table")
(jcode "MoveTask") (t "hdfs dir/file move")
(jcode "ConditionalTask") (t "conditional execution")
(jcode "...") (t "..."))
(* 3/2 gap-size)
;; Operator slide: exec/plan class pairing, base class, factory, and a
;; two-column table of operator classes.
;; NOTE(review): the opening `(slide` is not present, and the `table` call
;; stops after one separator argument and is never closed.  pict's `table`
;; also expects column/row alignment arguments and a second separator;
;; those cannot be determined from this file, so the structure is left as
;; found -- confirm with the author.
#:title runtime-operator
(item (jcode "exec/XXXOperator") "<-->" (jcode "plan/XXXDesc"))
(item "Base Class:" (jcode "Operator"))
(item "All Operators from" (jcode "OperatorFactory"))
(table 2
       ;; cells in row-major order: operator class, description
       (list (bt "Operator") (bt "Describe")
             (jcode "FilterOperator") (t "filter rows")
             (jcode "SelectOperator") (t "projection")
             (jcode "ReduceSinkOperator") (t "output to reducer")
             (jcode "FileSinkOperator") (t "output to file")
             (jcode "...") (t "..."))
       (* 3/2 gap-size)
;; Operator callback interfaces, one item per method.  The keyword
;; argument and the final closing parenthesis belong to a `slide` call, so
;; the form is opened with `(slide`.
(slide
 #:title (title/sub runtime-operator "Interfaces")
 (item (jcode "initialize()") "call back, called only once")
 (item (jcode "process()") "call back, called for each row")
 (item (jcode "close()") "call back, called only once")
 (item (jcode "startGroup()") "call back, called on new group")
 (item (jcode "endGroup()") "call back, called on current group end")
 (item (jcode "isDone()") "call back"))
;; Expression slide: exec/plan class pairing, base class, factory, and a
;; two-column table of evaluator classes.
;; NOTE(review): the opening `(slide` is not present, and the `table` call
;; stops after one separator argument and is never closed.  pict's `table`
;; also expects column/row alignment arguments and a second separator;
;; those cannot be determined from this file -- confirm with the author.
#:title runtime-expression
(item (jcode "exec/ExprXXXEvaluator") "<-->" (jcode "plan/ExprXXXDesc"))
(item "Base Class:" (jcode "ExprNodeEvaluator"))
;; NOTE(review): "All Operators from" matches the operator slide's wording;
;; "All Evaluators from" may have been intended here -- confirm.
(item "All Operators from" (jcode "ExprNodeEvaluatorFactory"))
(table 2
;; cells in row-major order: evaluator class, description
(list (bt "ExprEvaluator") (bt "Describe")
(jcode "...Constant...") (t "return a constant value")
(jcode "...Column...") (t "extract some column")
(jcode "...GenericFunc...") (t "function call")
(jcode "...Field...") (t "access struct field")
(jcode "...Null...") (t "mark `void'"))
(* 3/2 gap-size)
;; Each slide below is a `slide` call: the keyword argument at the top and
;; the extra closing parenthesis at the bottom of every group require the
;; `(slide` opener.

;; expression evaluator interface methods
(slide
 #:title (title/sub runtime-expression "Interfaces")
 (item (jcode "ObjectInspector initialize(ObjectInspector)"))
 (item (jcode "Object evaluate(Object)")))

;; ---- Object model section ----
;; outline slide for the object-model section
(outline 'object-model)

;; where the object model lives and what it is decoupled from
(slide
 #:title (title/sub object-model "Overview")
 (item "Resident in `serde' Directory")
 (subitem "other than `ql' directory")
 (item "De-Couple From Query Engine")
 (subitem "de-couple from compiler")
 (subitem "de-couple from runtime")
 (item "Support Multi-Format")
 (subitem "multiple on-disk format")
 (subitem "multiple in memory format"))

;; why several on-disk formats are supported
(slide
 #:title (title/sub object-model "On Disk Format 1")
 (item "Hive Support Multiple On Disk Format")
 (subitem "ease of use")
 (subitem "ease of integration")
 (subitem "flexibility, better trade off between space/performance"))

;; file formats and row formats
(slide
 #:title (title/sub object-model "On Disk Format 2")
 (item "File Format")
 (subitem "row based:" (jcode "TextFile"))
 (subitem "column based:" (jcode "RCFile"))
 (subitem "block based:" (jcode "SequenceFile"))
 (item "Row Format")
 (subitem "text based:" (jcode "TextFile"))
 (subitem "binary based")
 (subitem "customized"))

;; why several in-memory formats are supported, and the delegation idea
(slide
 #:title (title/sub object-model "In Memory Format 1")
 (item "Hive Support Multiple In Memory Format")
 (subitem "ease of integration")
 (subitem "different on disk format + efficient loading")
 (item "Key Idea: Delegation")
 (subitem "data: Object")
 (subitem "data access: ObjectInspector"))

;; inspector interfaces and two object/inspector pairings as examples
(slide
 #:title (title/sub object-model "In Memory Format 2")
 (item "Base Interfaces")
 (subitem (jcode "ObjectInspector"))
 (subitem (jcode "PrimitiveObjectInspector")
          (jcode "StructObjectInspector")
          (jcode "MapObjectInspector")
          (jcode "ListObjectInspector")
          (jcode "Union(Struct)ObjectInspector"))
 (item "Example")
 (subitem (jcode "Integer") "+" (jcode "JavaIntegerObjectInspector"))
 (subitem (jcode "IntWritable") "+"
          (jcode "WritableIntegerObjectInspector")))
;; ---- Ideal-diff section ----
;; outline slide for the ideal-diff section
(outline 'ideal-diff)

;; Differences, commonalities, and open problems.  The keyword argument
;; and the final closing parenthesis belong to a `slide` call, so the form
;; is opened with `(slide`.
(slide
 #:title ideal-diff
 (item "Differences")
 (subitem "hive --> DDBMS")
 (subitem "expection --> language")
 (item "Commons")
 (subitem "a SQL-like sub-language")
 (subitem "compiler and runtime")
 (item "Problem")
 (subitem "metastore")
 (subitem "name binding & abstraction"))
;; ---- References section ----
;; outline slide for the references section
(outline 'references)
;; References slide: two reference entries and a closing "Thank you!" line
;; drawn in the main font at title-font-size.
;; NOTE(review): the opening `(slide` is not present and the first `item`
;; is never closed; the URL string may have continued on a line that is
;; not in this file -- confirm with the author.
#:title references
(item "https://cwiki.apache.org/"
(item "source code 0.7.1")
(text "Thank you!" (current-main-font) title-font-size))
