前段时间一直在研究Hive,本周做了一个PPT,算是一个阶段性的总结,也给自己梳理一下思路。另外,这个PPT是用Racket(一种lisp)做的。
用代码做PPT,看起来比较傻。不过我觉得,做一件事,首先要把它做得有意思,这样才有意思。呵呵。
- #lang racket/base
-
-
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
;; Introduction to hive internals.
-
(require slideshow/base
-
slideshow/pict
-
racket/date
-
racket/class
-
racket/draw)
-
-
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
;; Overall settings
-
;; Backdrop pict placed behind every slide's content.
;;
;; The pict is (client-w + 2*margin) by (client-h + 2*margin) and is
;; inset by (- margin).  Its drawing procedure:
;;   1. remembers the drawing context's current brush and pen,
;;   2. lets color-series call draw-frame for steps 0..margin (step 1)
;;      while it moves the pen and brush colour from black to white,
;;      so each step draws a rectangle shrunk by one more unit per side,
;;   3. draws one last rectangle at offset margin with the solid white
;;      brush,
;;   4. restores the remembered pen and brush.
(define fade-bg
  (let* ((full-w (+ client-w (* 2 margin)))
         (full-h (+ client-h (* 2 margin)))
         (clear-brush (make-object brush% "white" 'transparent))
         (white-brush (make-object brush% "white" 'solid))
         (paint
          (lambda (ctx x0 y0)
            (define saved-brush (send ctx get-brush))
            (define saved-pen (send ctx get-pen))
            ;; rectangle shrunk by k on every side
            (define (draw-frame k)
              (send ctx draw-rectangle
                    (+ x0 k) (+ y0 k)
                    (- full-w (* 2 k)) (- full-h (* 2 k))))
            (send ctx set-brush clear-brush)
            (color-series ctx margin 1
                          (make-object color% "black")
                          (make-object color% "white")
                          draw-frame
                          #t #t)
            (send ctx set-brush white-brush)
            (draw-frame margin)
            ;; leave the drawing context as we found it
            (send ctx set-pen saved-pen)
            (send ctx set-brush saved-brush))))
    (inset (dc paint full-w full-h 0 0)
           (- margin))))
-
-
;; Custom slide assembler: every slide is superimposed (centre-top) on
;; fade-bg.  When the slide has a title, the title is rendered with
;; titlet and stacked above the content, horizontally centred, with
;; v-sep between the two; a slide without a title shows only its
;; content.
(current-slide-assembler
 (lambda (title v-sep content)
   (define body
     (cond
       (title (vc-append v-sep (titlet title) content))
       (else content)))
   (ct-superimpose fade-bg body)))
-
-
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
;; Utility functions & constants
-
-
;; Entry used inside the outline: an `item` whose width is three
;; quarters of the current paragraph width.  All arguments are
;; collected into one list, which is passed to `item` as a single
;; element.
(define outline-item
  (lambda elems
    (item #:width (* (current-para-width) 3/4) elems)))
-
-
;; Combine a title and a subtitle into one string of the form
;; "TITLE: SUBTITLE".  Both arguments must be strings.
(define (title/sub title subtitle)
  (let ((separator ": "))
    (string-append title separator subtitle)))
-
-
;; Titles used in this talk.  Each string is shared between the outline
;; and the slides of the corresponding section.

;; -- overview section
(define overview "Big Pictures")
(define overview-arch "Overall Architecture")
(define overview-flow "Overall Control Flow")

;; -- compiler section
(define compiler "Compiler Internals")
(define compiler-overview "Overview")
(define compiler-parser "Parser")
(define compiler-semantic "Semantic Analyzer")
(define compiler-optimizer "Optimizer")
(define compiler-task/plan "Physical Plan Gen")

;; -- runtime section
(define runtime "Runtime Internals")
(define runtime-overview "Overview")
(define runtime-task "Task")
(define runtime-operator "Operator")
(define runtime-expression "Expression")

;; -- remaining sections
(define object-model "Object Model")

;; spelling fixed: was "Expection"
(define ideal-diff "Distance to Expectation")

(define references "References")
-
-
;; Outline of this talk, built with make-outline.  Each section is a
;; tag symbol, its title, and either #f (no sub-items) or a procedure
;; that receives the tag and returns the pict listing the sub-items.
;; The local helper `sub-items` builds such a procedure: it turns every
;; given title into an outline-item and stacks them left-aligned.
(define outline
  (let ((sub-items
         (lambda titles
           (lambda (tag)
             (apply vl-append (map outline-item titles))))))
    (make-outline
     'overview overview
     (sub-items "Background" overview-arch overview-flow)

     'compiler compiler
     (sub-items compiler-overview
                compiler-parser
                compiler-semantic
                compiler-optimizer
                compiler-task/plan)

     'runtime runtime
     (sub-items runtime-overview
                runtime-task
                runtime-operator
                runtime-expression)

     'object-model object-model #f

     'ideal-diff ideal-diff #f

     'references references #f)))
-
-
;; Font size of the large title text: one and a half times the current
;; slide font size.  The product is rounded because this value is
;; passed to `text` as its size argument, which is documented to be an
;; integer; with an odd base size the plain product would be a fraction.
;; For an even base size the result is unchanged.
(define title-font-size
  (round (* 3/2 (current-font-size))))
-
-
;; Textual form of the date on which the program is run, produced by
;; date->string from the current date; shown on the title slide.
(define date-str
  (let ((now (current-date)))
    (date->string now)))
-
-
;; Render STR as a piece of Java code (class, method or token name):
;; italic text coloured "darkblue".
(define (jcode str)
  (let ((italic-pict (it str)))
    (colorize italic-pict "darkblue")))
-
-
;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
;; Slides begin here
-
;; Title slide: talk title in the large title font, author address,
;; the run date, a blank spacer and a small "Powered by Scheme" credit.
(slide
 #:name "Begin"
 (text "Hive Internals" (current-main-font) title-font-size)
 (t "liangkun@baidu.com")
 (t date-str)
 (blank)
 ;; The credit uses a third of the title size.  The value is rounded
 ;; because `text` is documented to take an integer size and the
 ;; division need not come out even.
 (text "Powered by Scheme"
       (current-main-font)
       (round (* 1/3 title-font-size))))
-
-
;; Section divider: show the outline with the overview section active.
(outline 'overview)

;; Background slide, revealed in three steps separated by 'next.
;; Spelling fixed in the displayed text: "facillitates" -> "facilities",
;; "dose" -> "does".
(slide
 #:title "Background"
 (item "What is Hive")
 (subitem "data warehouse facilities on hadoop")
 (subitem "SQL-like query language & engine")

 'next
 (item "What does Hive do")
 (subitem "enable easy ETL")
 (subitem "impose structure on data")
 (subitem "access files on hadoop or other storage(HBase)")
 (subitem "query execution via MapReduce")

 'next
 (bit "Hive is heading to a DDBMS!"))
-
-
;; Architecture diagram: the bitmap file hive-arch.bmp scaled by 0.80.
(slide
 #:title overview-arch
 (let ((arch-diagram (bitmap "hive-arch.bmp")))
   (scale arch-diagram 0.80)))
-
-
;; Source tree tour: one entry per top-level directory.  The directory
;; name, in bold, is used as the bullet of the entry.
(let ((module-entry
       (lambda (name description)
         (item #:bullet (bt name) description))))
  (slide
   #:title (title/sub overview-arch "Source Code")
   (module-entry "cli:" "command line interface")
   (module-entry "hwi:" "web interface")
   (module-entry "service:" "thrift server interface")
   (module-entry "odbc/jdbc:" "database connection interface")
   (module-entry "serde:" "object module & (de)serialization")
   (module-entry "ql:" "compiler & runtime")
   (module-entry "metastore:" "metastore layer")
   (module-entry "contrib:" "third udf/serde libraries")))
-
-
;; Control flow in two stages, each introduced by 'next:
;; session initialization, then the per-command processing loop.
(slide
 #:title overview-flow

 ;; stage 1: start-up
 'next
 (item "Initialization")
 (subitem "command line and config file parsing")
 (subitem "init logging system")
 (subitem "start a new session through" (jcode "SessionState"))

 ;; stage 2: command loop
 'next
 (item "Process Loop(for-each HQL command):")
 (subitem "get a" (jcode "CommandProcessor") "instance from"
          (jcode "CommandProcessorFactory"))
 (subitem "run the processor")
 (subitem "get and display results"))
-
-
;; Command-to-processor mapping shown as a framed three-column table
;; (header row followed by one row per command family).
(slide
 #:title (title/sub overview-flow "CommandProcessor")
 (para "All Processors from" (jcode "CommandProcessorFactory"))
 (let* ((cells
         (list (bt "Command") (bt "Processor") (bt "Describe")
               (t "SET ...") (jcode "SetProcessor") (t "property setting")
               (t "DFS ...") (jcode "DfsProcessor") (t "hadoop dfs command")
               (t "ADD ...") (jcode "AddProcessor") (t "add resources")
               (t "DELETE ...") (jcode "DeleteProcessor") (t "delete resources")
               (t "other") (jcode "Driver") (t "HQL processing")))
        ;; column separation 3/2 gap-size, row separation gap-size
        (grid (table 3
                     cells
                     lc-superimpose
                     cc-superimpose
                     (* 3/2 gap-size)
                     gap-size)))
   (frame (inset grid gap-size))))
-
-
;; Control-flow diagram: the bitmap file hive-flow.bmp scaled by 0.55.
(slide
 #:title overview-flow
 (let ((flow-diagram (bitmap "hive-flow.bmp")))
   (scale flow-diagram 0.55)))
-
-
;; Section divider: show the outline with the compiler section active.
(outline 'compiler)

;; Compiler layers, one plain item per layer, in pipeline order.
(apply slide
       #:title (title/sub compiler-overview "Layers")
       (map item
            (list "Variable replacement (in driver)"
                  "Case Conversion"
                  "Parser: HQL to AST"
                  "Logical Plan Gen: AST to QueryBlock"
                  "Optimization: Logical Plan to Logical Plan"
                  "Physical Plan Gen: Logical Plan to QueryPlan")))
-
-
;; Intermediate languages of the compiler, revealed one at a time:
;; AST, query block tree, physical plan.
(slide
 #:title (title/sub compiler-overview "ILs")

 'next
 (item "AST" (jcode "ASTNode"))

 'next
 (item "Query Block Tree" (jcode "QB"))
 (subitem "each node is a query")
 (subitem "subquery as child-node")
 (subitem "contains lookaside tables(metainfo)")
 (subitem "contains operator DAG")

 'next
 (item "Physical Plan" (jcode "QueryPlan")))
-
-
;; Parser slide: role, the Antlr grammar, and the imaginary TOK_ tokens.
(slide
 #:title compiler-parser
 (item "Front End")

 ;; grammar and generator
 (item "Antlr Parser Generator")
 (subitem "syntax definition: ql/parse/Hive.g")
 (subitem "LL(3) without backtracking")
 (subitem "actions are constructing tree")

 ;; tokens that exist only in the tree
 (item "Imaginary Tokens")
 (subitem "begin with:" (jcode "TOK_"))
 (subitem "logical node")
 (subitem (jcode "^(TOK_SELECT hintClause? SelectList)")))
-
-
;; Semantic analyzer slide: role, base class, factory, and a framed
;; two-column table of the analyzer families.
;; Class names corrected in the displayed text: "SemanticAnanlyzerFactory"
;; -> "SemanticAnalyzerFactory" and "SemanticAnanlyzer" ->
;; "SemanticAnalyzer".
(slide
 #:title compiler-semantic
 (item "Mid-End & Back-End Driver")
 (item "Base Class:" (jcode "BaseSemanticAnalyzer"))
 (item "All Analyzers from" (jcode "SemanticAnalyzerFactory"))
 (frame
  (inset
   (table 2
          (list (bt "SemanticAnalyzers") (bt "Describe")
                (jcode "Explain...") (t "EXPLAIN statements")
                (jcode "Load...") (t "loading data into a table")
                (jcode "DDL...") (t "metadata manipulation")
                (jcode "Function...") (t "create/drop udf")
                (jcode "SemanticAnalyzer") (t "other queries"))
          lc-superimpose
          cc-superimpose
          (* 3/2 gap-size)
          gap-size)
   gap-size)))
-
-
;; Logical plan generation, part 1: the doPhase1() and getMetaData()
;; steps, each revealed by its own 'next.
(slide
 #:title (title/sub compiler-semantic "Logical Plan Gen 1")

 ;; step 1
 'next
 (item (jcode "doPhase1()") "recursively traverse AST")
 (subitem "construct lookaside tables: aliasToTable, aliasToSubq,"
          "clauseToDest, ...")
 (subitem "put lookaside tables into" (jcode "QB/QBParseInfo"))
 (subitem "check for semantic errors")

 ;; step 2
 'next
 (item (jcode "getMetaData()"))
 (subitem "for source tables, destination tables")
 (subitem "put metadata into" (jcode "QB/QBParseInfo")))
-
-
;; Logical plan generation, part 2: what genPlan() produces.
(slide
 #:title (title/sub compiler-semantic "Logical Plan Gen 2")
 (item (jcode "genPlan()"))
 (subitem "recursively called on sub-queries")
 (subitem "gen" (jcode "TableScanOperator") "for source table")
 (subitem "gen DAG for lateral view")
 (subitem "gen JoinPlan if there is a join token")
 (subitem (jcode "genBodyPlan()") "for other clause"))
-
-
;; Optimizer basics: what a pass is, then (after 'next) how the
;; Optimizer class organizes and drives the passes.
(slide
 #:title (title/sub compiler-optimizer "Basic Form")

 ;; a single pass
 (item "Pass")
 (subitem (jcode "Transform") "interface")
 (subitem (jcode "ParseContext transform(ParseContext pctx)"))

 ;; the pass manager
 'next
 (item (jcode "Optimizer"))
 (subitem "organizer: register a sequence of passes")
 (subitem "driver: apply registered passes in order")
 (subitem "context:" (jcode "ParseContext") "object"))
-
-
;; Motivation for the traversal framework: the pattern shared by the
;; passes, then (after 'next) the idea of extracting it.
;; Grammar fixed in the displayed text: "according some rules" ->
;; "according to some rules".
(slide
 #:title (title/sub compiler-optimizer "Improvement Opportunity")
 (item "Common Operation Pattern")
 (subitem "traverses IL in some order")
 (subitem "apply some action(s) according to some rules"
          "when visiting a node in traversal")

 'next
 (item "Extract Common Pattern")
 (subitem "traversal framework"))
-
-
;; Building blocks of the traversal framework, revealed one by one.
(slide
 #:title (title/sub compiler-optimizer "Traversal Framework")

 (item (jcode "Node"))
 (subitem "how to traverse")

 'next
 (item (jcode "GraphWalker"))
 (subitem "traversal order")

 'next
 (item (jcode "Dispatcher"))
 (subitem "find best(lowest cost) rule, apply its action")

 'next
 (item (jcode "Rule") "&" (jcode "NodeProcessor"))
 (subitem "rules and actions"))
-
-
;; Optimizations listed on the slide: one item per pass class name
;; (rendered with jcode), followed by a plain "..." item.
(apply slide
       #:title (title/sub compiler-optimizer "Current Optimizations")
       (append
        (map (lambda (pass-name) (item (jcode pass-name)))
             (list "ColumnPruner"
                   "PartitionPruner"
                   "SamplePruner"
                   "PredicatePushDown"
                   "GroupByOptimizer"
                   "MapJoinProcessor"
                   "JoinReorder"))
        (list (item "..."))))
-
-
;; Physical plan generation slide.
;; Spelling fixed in the displayed text: "alse" -> "also".
(slide
 #:title compiler-task/plan
 (item (jcode "genMapRedWorks()"))
 (subitem "take" (jcode "QB") "as input")
 (subitem "gen task DAG")
 (subitem "also use optimization framework"))
-
-
;; Section divider: show the outline with the runtime section active.
(outline 'runtime)

;; Runtime entities: tasks, operators and expressions, each with a
;; short description and example class names.
;; Grammar fixed in the displayed text: "may consists of" ->
;; "may consist of", "may contains" -> "may contain".
(slide
 #:title (title/sub runtime-overview "Entities")
 (item "Tasks")
 (subitem "may consist of operators DAG")
 (subitem (jcode "DDLTask") (jcode "MapRedTask"))

 (item "Operators")
 (subitem "may contain expressions")
 (subitem (jcode "FilterOperator") (jcode "SelectOperator"))

 (item "Expressions")
 (subitem "expression tree for specific computation")
 (subitem "ConstantEvaluator"
          "GenericFunctionEvaluator"))
-
-
;; Task slide: exec/plan pairing, base class, factory, and a framed
;; two-column table of task kinds.
(slide
 #:title runtime-task
 (item (jcode "exec/XXXTask") "<-->" (jcode "plan/XXXWork"))
 (item "Base Class:" (jcode "Task"))
 (item "All Tasks from" (jcode "TaskFactory"))
 (let* ((cells
         (list (bt "Task") (bt "Describe")
               (jcode "MapRedTask") (t "map or reduce job")
               (jcode "StatsTask") (t "analyze a table")
               (jcode "MoveTask") (t "hdfs dir/file move")
               (jcode "ConditionalTask") (t "conditional execution")
               (jcode "...") (t "...")))
        ;; column separation 3/2 gap-size, row separation gap-size
        (grid (table 2
                     cells
                     lc-superimpose
                     cc-superimpose
                     (* 3/2 gap-size)
                     gap-size)))
   (frame (inset grid gap-size))))
-
-
;; Operator slide: exec/plan pairing, base class, factory, and a framed
;; two-column table of operator kinds.
;; Spelling fixed in the displayed text: "filte rows" -> "filter rows".
(slide
 #:title runtime-operator
 (item (jcode "exec/XXXOperator") "<-->" (jcode "plan/XXXDesc"))
 (item "Base Class:" (jcode "Operator"))
 (item "All Operators from" (jcode "OperatorFactory"))
 (frame
  (inset
   (table 2
          (list (bt "Operator") (bt "Describe")
                (jcode "FilterOperator") (t "filter rows")
                (jcode "SelectOperator") (t "projection")
                (jcode "ReduceSinkOperator") (t "output to reducer")
                (jcode "FileSinkOperator") (t "output to file")
                (jcode "...") (t "..."))
          lc-superimpose
          cc-superimpose
          (* 3/2 gap-size)
          gap-size)
   gap-size)))
-
-
;; Operator callbacks: one item per method, showing the method name
;; (rendered with jcode) followed by a note on when it is called.
(let ((callback
       (lambda (signature note)
         (item (jcode signature) note))))
  (slide
   #:title (title/sub runtime-operator "Interfaces")
   (callback "initialize()" "call back, called only once")
   (callback "process()" "call back, called for each row")
   (callback "close()" "call back, called only once")
   (callback "startGroup()" "call back, called on new group")
   (callback "endGroup()" "call back, called on current group end")
   (callback "isDone()" "call back")))
-
-
;; Expression evaluator slide: exec/plan pairing, base class, factory,
;; and a framed two-column table of evaluator kinds.
;; Copy-paste slip fixed in the displayed text: the factory line said
;; "All Operators from" although this slide is about evaluators created
;; by ExprNodeEvaluatorFactory; it now reads "All Evaluators from".
(slide
 #:title runtime-expression
 (item (jcode "exec/ExprXXXEvaluator") "<-->" (jcode "plan/ExprXXXDesc"))
 (item "Base Class:" (jcode "ExprNodeEvaluator"))
 (item "All Evaluators from" (jcode "ExprNodeEvaluatorFactory"))
 (frame
  (inset
   (table 2
          (list (bt "ExprEvaluator") (bt "Describe")
                (jcode "...Constant...") (t "return a constant value")
                (jcode "...Column...") (t "extract some column")
                (jcode "...GenericFunc...") (t "function call")
                (jcode "...Field...") (t "access struct field")
                (jcode "...Null...") (t "mark `void'"))
          lc-superimpose
          cc-superimpose
          (* 3/2 gap-size)
          gap-size)
   gap-size)))
-
-
;; The two evaluator methods shown on the slide, one item per signature.
(apply slide
       #:title (title/sub runtime-expression "Interfaces")
       (map (lambda (signature) (item (jcode signature)))
            (list "ObjectInspector initialize(ObjectInspector)"
                  "Object evaluate(Object)")))
-
-
;; Section divider: show the outline with the object-model section
;; active.
(outline 'object-model)

;; Object model overview: where it lives, its decoupling from the
;; query engine, and multi-format support.
(slide
 #:title (title/sub object-model "Overview")

 ;; location
 (item "Resident in `serde' Directory")
 (subitem "other than `ql' directory")

 ;; independence
 (item "De-Couple From Query Engine")
 (subitem "de-couple from compiler")
 (subitem "de-couple from runtime")

 ;; formats
 (item "Support Multi-Format")
 (subitem "multiple on-disk format")
 (subitem "multiple in memory format"))
-
-
;; On-disk formats, part 1: the headline item followed by one subitem
;; per reason given on the slide.
(apply slide
       #:title (title/sub object-model "On Disk Format 1")
       (item "Hive Support Multiple On Disk Format")
       (map subitem
            (list "ease of use"
                  "ease of integration"
                  "flexibility, better trade off between space/performance")))
-
-
;; On-disk formats, part 2: file-level formats and row-level formats.
(slide
 #:title (title/sub object-model "On Disk Format 2")

 ;; file level
 (item "File Format")
 (subitem "row based:" (jcode "TextFile"))
 (subitem "column based:" (jcode "RCFile"))
 (subitem "block based:" (jcode "SequenceFile"))

 ;; row level
 (item "Row Format")
 (subitem "text based:" (jcode "TextFile"))
 (subitem "binary based")
 (subitem "customized"))
-
-
;; In-memory formats, part 1: motivation and the delegation idea
;; (data held as Object, accessed through ObjectInspector).
;; Spelling fixed in the displayed text: "Memeory" -> "Memory".
(slide
 #:title (title/sub object-model "In Memory Format 1")
 (item "Hive Support Multiple In Memory Format")
 (subitem "ease of integration")
 (subitem "different on disk format + efficient loading")

 (item "Key Idea: Delegation")
 (subitem "data: Object")
 (subitem "data access: ObjectInspector"))
-
-
;; In-memory formats, part 2: the inspector interfaces and two
;; examples pairing a Java value type with its inspector.
(slide
 #:title (title/sub object-model "In Memory Format 2")

 ;; interface hierarchy
 (item "Base Interfaces")
 (subitem (jcode "ObjectInspector"))
 (subitem (jcode "PrimitiveObjectInspector")
          (jcode "StructObjectInspector")
          (jcode "MapObjectInspector")
          (jcode "ListObjectInspector")
          (jcode "Union(Struct)ObjectInspector"))

 ;; value type + inspector pairs
 (item "Example")
 (subitem (jcode "Integer") "+" (jcode "JavaIntegerObjectInspector"))
 (subitem (jcode "IntWritable") "+"
          (jcode "WritableIntegerObjectInspector")))
-
-
;; Section divider: show the outline with the ideal-diff section active.
(outline 'ideal-diff)

;; Comparison slide: differences, commonalities and open problems.
;; Spelling fixed in the displayed text: "expection" -> "expectation".
(slide
 #:title ideal-diff
 (item "Differences")
 (subitem "hive --> DDBMS")
 (subitem "expectation --> language")

 (item "Commons")
 (subitem "a SQL-like sub-language")
 (subitem "compiler and runtime")

 (item "Problem")
 (subitem "metastore")
 (subitem "name binding & abstraction"))
-
-
;; Section divider: show the outline with the references section active.
(outline 'references)

;; Closing slide: the two references, a blank spacer, and a
;; "Thank you!" line in the large title font.
(slide
 #:title references
 ;; the URL is split over two strings of one item
 (item "https://cwiki.apache.org/"
       "confluence/display/Hive/Presentations")
 (item "source code 0.7.1")
 (blank)
 (let ((closing-font (current-main-font)))
   (text "Thank you!" closing-font title-font-size)))
阅读(2907) | 评论(0) | 转发(0) |