Chinaunix首页 | 论坛 | 博客
  • 博客访问: 959756
  • 博文数量: 104
  • 博客积分: 3715
  • 博客等级: 中校
  • 技术积分: 1868
  • 用 户 组: 普通用户
  • 注册时间: 2006-04-30 08:38
文章分类

全部博文(104)

文章存档

2013年(1)

2012年(9)

2011年(41)

2010年(3)

2009年(3)

2008年(47)

分类: 云计算

2011-12-07 09:27:36

前段时间一直在研究Hive,本周做了一个PPT,算是一个阶段性的总结,也给自己梳理一下思路。另外,这个PPT是用Racket(一种lisp)做的。

用代码做PPT,看起来比较傻。不过我觉得,做一件事,首先要把它做得有意思,这样才有意思。呵呵。

好了,不说废话,下面就是啦。如果想要看的话,到racket-lang.org下载racket安装,打开hive-internals.rkt文件(附件:hive-internals.rar),运行就可以了(PS,附件hive-internals-img.rar中的两个图片要和hive-internals.rkt放在同一个目录中,否则可能找不到图片)。


  1. #lang racket/base

  2. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  3. ;; Introduction to hive internals.
  4. (require slideshow/base
  5.          slideshow/pict
  6.          racket/date
  7.          racket/class
  8.          racket/draw)

  9. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  10. ;; Overall settings
  11. ;; A pict to use behind the main content: a w-by-h drawing, inset by (- margin).
  12. (define fade-bg
  13.   (let ((w (+ (* 2 margin) client-w)) ; client width plus one margin on each side
  14.         (h (+ (* 2 margin) client-h)) ; client height plus one margin on each side
  15.         (trans (make-object brush% "white" 'transparent))
  16.         (inside (make-object brush% "white" 'solid)))
  17.     (inset (dc (lambda (dc x y)
  18.                  (let ((b (send dc get-brush)) ; saved; restored at the end
  19.                        (p (send dc get-pen)) ; saved; restored at the end
  20.                        (draw-one ; draws the w-by-h rectangle shrunk by i on every side
  21.                         (lambda (i)
  22.                           (send dc draw-rectangle
  23.                                 (+ x i) (+ y i)
  24.                                 (- w (* 2 i)) (- h (* 2 i))))))
  25.                    (send dc set-brush trans)
  26.                    (color-series ; hands draw-one and the black/white end colors to color-series
  27.                     dc margin 1
  28.                     (make-object color% "black")
  29.                     (make-object color% "white")
  30.                     draw-one
  31.                     #t #t)
  32.                    (send dc set-brush inside)
  33.                    (draw-one margin) ; rectangle inset by exactly margin, drawn with the solid white brush
  34.                    (send dc set-pen p) ; put the caller's pen back
  35.                    (send dc set-brush b))) ; put the caller's brush back
  36.                w h 0 0)
  37.            (- margin)))) ; negative inset amount

  38. ;; Install a new slide assembler: superimposes fade-bg with the title (if any) and the content.
  39. (current-slide-assembler
  40.  (lambda (title v-sep content)
  41.    (ct-superimpose
  42.     fade-bg
  43.     (if title ; title may be #f, in which case only the content is used
  44.         (vc-append v-sep
  45.                    ;; vc-append stacks vertically and centers the title over the content:
  46.                    (titlet title)
  47.                    content)
  48.         content))))

  49. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  50. ;; Utility functions & constants

  51. ;; Outline entry: an `item' whose width is 3/4 of the current paragraph width.
  52. (define (outline-item . l)
  53.   (let ((w (* 3/4 (current-para-width)))) (item #:width w l)))

  54. ;; Join a title and a subtitle into one string of the form "title: subtitle".
  55. (define (title/sub title subtitle)
  56.   (apply string-append (list title ": " subtitle)))

  57. ;; Title strings used in this talk; shared by the outline and the slide titles.
  58. (define overview "Big Pictures")
  59. (define overview-arch "Overall Architecture")
  60. (define overview-flow "Overall Control Flow")

  61. (define compiler "Compiler Internals")
  62. (define compiler-overview "Overview")
  63. (define compiler-parser "Parser")
  64. (define compiler-semantic "Semantic Analyzer")
  65. (define compiler-optimizer "Optimizer")
  66. (define compiler-task/plan "Physical Plan Gen")

  67. (define runtime "Runtime Internals")
  68. (define runtime-overview "Overview")
  69. (define runtime-task "Task")
  70. (define runtime-operator "Operator")
  71. (define runtime-expression "Expression")

  72. (define object-model "Object Model")

  73. (define ideal-diff "Distance to Expectation") ; was misspelled "Expection"

  74. (define references "References")

  75. ;; Outline of this talk, built with make-outline from (symbol, title, sub-items) triples.
  76. (define outline
  77.   (make-outline
  78.     'overview overview ; section symbol, then its title string
  79.     (lambda (tag) ; third element: procedure producing the section's sub-items
  80.       (vl-append
  81.         (outline-item "Background")
  82.         (outline-item overview-arch)
  83.         (outline-item overview-flow)))

  84.     'compiler compiler
  85.     (lambda (tag)
  86.       (vl-append
  87.         (outline-item compiler-overview)
  88.         (outline-item compiler-parser)
  89.         (outline-item compiler-semantic)
  90.         (outline-item compiler-optimizer)
  91.         (outline-item compiler-task/plan)))

  92.     'runtime runtime
  93.     (lambda (tag)
  94.       (vl-append
  95.         (outline-item runtime-overview)
  96.         (outline-item runtime-task)
  97.         (outline-item runtime-operator)
  98.         (outline-item runtime-expression)))

  99.     'object-model object-model
  100.     #f ; #f here: this section lists no sub-items

  101.     'ideal-diff ideal-diff
  102.     #f

  103.     'references references
  104.     #f))

  105. (define title-font-size ; 1.5x the font size current when this module is loaded
  106.   (* (current-font-size) 3/2))

  107. (define date-str ; date string computed once, when this module is loaded
  108.   (let ((now (current-date))) (date->string now)))

  109. ;; Render STR as Java-code text: italic (`it'), colorized "darkblue".
  110. (define (jcode str)
  111.   (let ((italic (it str))) (colorize italic "darkblue")))

  112. ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  113. ;; Slides begin here
  114. (slide ; opening slide: talk title, author address, date, footer line
  115.   #:name "Begin"
  116.   (text "Hive Internals" (current-main-font) title-font-size)
  117.   (t "liangkun@baidu.com")
  118.   (t date-str)
  119.   (blank)
  120.   (text "Powered by Scheme"
  121.         (current-main-font)
  122.         (* 1/3 title-font-size))) ; footer at one third of the title size

  123. (outline 'overview) ; outline call for the 'overview section

  124. (slide ; background: what Hive is and what it does (typos in the slide text fixed)
  125.   #:title "Background"
  126.   (item "What is Hive")
  127.   (subitem "data warehouse facilities on hadoop")
  128.   (subitem "SQL-like query language & engine")

  129.   'next
  130.   (item "What does Hive do")
  131.   (subitem "enable easy ETL")
  132.   (subitem "impose structure on data")
  133.   (subitem "access files on hadoop or other storage(HBase)")
  134.   (subitem "query execution via MapReduce")

  135.   'next
  136.   (bit "Hive is heading to a DDBMS!"))

  137. (slide ; architecture picture loaded from hive-arch.bmp (path is relative)
  138.   #:title overview-arch
  139.   (scale (bitmap "hive-arch.bmp") 0.80)) ; shown at 80% scale

  140. (slide ; source-tree tour: each bullet is a bold directory name followed by its role
  141.   #:title (title/sub overview-arch "Source Code")
  142.   (item #:bullet (bt "cli:") "command line interface")
  143.   (item #:bullet (bt "hwi:") "web interface")
  144.   (item #:bullet (bt "service:") "thrift server interface")
  145.   (item #:bullet (bt "odbc/jdbc:") "database connection interface")
  146.   (item #:bullet (bt "serde:") "object model & (de)serialization")
  147.   (item #:bullet (bt "ql:") "compiler & runtime")
  148.   (item #:bullet (bt "metastore:") "metastore layer")
  149.   (item #:bullet (bt "contrib:") "third-party udf/serde libraries"))

  150. (slide ; control flow: initialization, then the per-command processing loop
  151.   #:title overview-flow
  152.   'next
  153.   (item "Initialization")
  154.   (subitem "command line and config file parsing")
  155.   (subitem "init logging system")
  156.   (subitem "start a new session through" (jcode "SessionState"))

  157.   'next
  158.   (item "Process Loop(for-each HQL command):")
  159.   (subitem "get a" (jcode "CommandProcessor") "instance from"
  160.            (jcode "CommandProcessorFactory"))
  161.   (subitem "run the processor")
  162.   (subitem "get and display results"))

  163. (slide ; framed table mapping each command to its processor
  164.   #:title (title/sub overview-flow "CommandProcessor")
  165.   (para "All Processors from" (jcode "CommandProcessorFactory"))
  166.   (frame
  167.     (inset
  168.       (table 3 ; three columns; the first three picts form the header row
  169.              (list (bt "Command") (bt "Processor") (bt "Describe")
  170.                    (t "SET ...") (jcode "SetProcessor") (t "property setting")
  171.                    (t "DFS ...") (jcode "DfsProcessor") (t "hadoop dfs command")
  172.                    (t "ADD ...") (jcode "AddProcessor") (t "add resources")
  173.                    (t "DELETE ...") (jcode "DeleteProcessor") (t "delete resources")
  174.                    (t "other") (jcode "Driver") (t "HQL processing"))
  175.              lc-superimpose
  176.              cc-superimpose
  177.              (* 3/2 gap-size)
  178.              gap-size)
  179.       gap-size))) ; inset amount between the table and its frame

  180. (slide ; control-flow picture loaded from hive-flow.bmp (path is relative)
  181.   #:title overview-flow
  182.   (scale (bitmap "hive-flow.bmp") 0.55)) ; shown at 55% scale

  183. (outline 'compiler) ; outline call for the 'compiler section

  184. (slide ; compiler layers, listed in order
  185.   #:title (title/sub compiler-overview "Layers")
  186.   (item "Variable replacement (in driver)")
  187.   (item "Case Conversion")
  188.   (item "Parser: HQL to AST")
  189.   (item "Logical Plan Gen: AST to QueryBlock")
  190.   (item "Optimization: Logical Plan to Logical Plan")
  191.   (item "Physical Plan Gen: Logical Plan to QueryPlan"))

  192. (slide ; intermediate languages: AST, query block tree, physical plan
  193.   #:title (title/sub compiler-overview "ILs")
  194.   'next
  195.   (item "AST" (jcode "ASTNode"))
  196.   'next
  197.   (item "Query Block Tree" (jcode "QB"))
  198.   (subitem "each node is a query")
  199.   (subitem "subquery as child-node")
  200.   (subitem "contains lookaside tables(metainfo)")
  201.   (subitem "contains operator DAG")
  202.   'next
  203.   (item "Physical Plan" (jcode "QueryPlan")))

  204. (slide ; parser: front end, Antlr grammar, imaginary tokens
  205.   #:title compiler-parser
  206.   (item "Front End")

  207.   (item "Antlr Parser Generator")
  208.   (subitem "syntax definition: ql/parse/Hive.g")
  209.   (subitem "LL(3) without backtracking")
  210.   (subitem "actions are constructing tree")

  211.   (item "Imaginary Tokens")
  212.   (subitem "begin with:" (jcode "TOK_"))
  213.   (subitem "logical node")
  214.   (subitem (jcode "^(TOK_SELECT hintClause? SelectList)"))) ; sample tree-construction rule

  215. (slide ; semantic analyzers and what each handles (class-name spelling fixed)
  216.   #:title compiler-semantic
  217.   (item "Mid-End & Back-End Driver")
  218.   (item "Base Class:" (jcode "BaseSemanticAnalyzer"))
  219.   (item "All Analyzers from" (jcode "SemanticAnalyzerFactory"))
  220.   (frame
  221.     (inset
  222.       (table 2 ; two columns; the first two picts form the header row
  223.              (list (bt "SemanticAnalyzers") (bt "Describe")
  224.                    (jcode "Explain...") (t "EXPLAIN statements")
  225.                    (jcode "Load...") (t "loading data into a table")
  226.                    (jcode "DDL...") (t "metadata manipulation")
  227.                    (jcode "Function...") (t "create/drop udf")
  228.                    (jcode "SemanticAnalyzer") (t "other queries"))
  229.              lc-superimpose
  230.              cc-superimpose
  231.              (* 3/2 gap-size)
  232.              gap-size)
  233.       gap-size)))

  234. (slide ; logical plan generation, part 1: doPhase1() and getMetaData()
  235.   #:title (title/sub compiler-semantic "Logical Plan Gen 1")
  236.   'next
  237.   (item (jcode "doPhase1()") "recursively traverse AST")
  238.   (subitem "construct lookaside tables: aliasToTable, aliasToSubq,"
  239.            "clauseToDest, ...")
  240.   (subitem "put lookaside tables into" (jcode "QB/QBParseInfo"))
  241.   (subitem "check for semantic errors")

  242.   'next
  243.   (item (jcode "getMetaData()"))
  244.   (subitem "for source tables, destination tables")
  245.   (subitem "put metadata into" (jcode "QB/QBParseInfo")))

  246. (slide ; logical plan generation, part 2: genPlan()
  247.   #:title (title/sub compiler-semantic "Logical Plan Gen 2")
  248.   (item (jcode "genPlan()"))
  249.   (subitem "recursively called on sub-queries")
  250.   (subitem "gen" (jcode "TableScanOperator") "for source table")
  251.   (subitem "gen DAG for lateral view")
  252.   (subitem "gen JoinPlan if there is a join token")
  253.   (subitem (jcode "genBodyPlan()") "for other clause"))

  254. (slide ; optimizer basics: the pass interface and the Optimizer that runs passes
  255.   #:title (title/sub compiler-optimizer "Basic Form")
  256.   (item "Pass")
  257.   (subitem (jcode "Transform") "interface")
  258.   (subitem (jcode "ParseContext transform(ParseContext pctx)"))

  259.   'next
  260.   (item (jcode "Optimizer"))
  261.   (subitem "organizer: register a sequence of passes")
  262.   (subitem "driver: apply registered passes in order")
  263.   (subitem "context:" (jcode "ParseContext") "object"))

  264. (slide ; motivation for a shared traversal framework (missing "to" added in the text)
  265.   #:title (title/sub compiler-optimizer "Improvement Opportunity")
  266.   (item "Common Operation Pattern")
  267.   (subitem "traverses IL in some order")
  268.   (subitem "apply some action(s) according to some rules"
  269.            "when visiting a node in traversal")

  270.   'next
  271.   (item "Extract Common Pattern")
  272.   (subitem "traversal framework"))

  273. (slide ; traversal framework: one item per participating class
  274.   #:title (title/sub compiler-optimizer "Traversal Framework")
  275.   (item (jcode "Node"))
  276.   (subitem "how to traverse")

  277.   'next
  278.   (item (jcode "GraphWalker"))
  279.   (subitem "traversal order")

  280.   'next
  281.   (item (jcode "Dispatcher"))
  282.   (subitem "find best(lowest cost) rule, apply its action")

  283.   'next
  284.   (item (jcode "Rule") "&" (jcode "NodeProcessor"))
  285.   (subitem "rules and actions"))

  286. (slide ; list of optimizations, ending with an open-ended "..."
  287.   #:title (title/sub compiler-optimizer "Current Optimizations")
  288.   (item (jcode "ColumnPruner"))
  289.   (item (jcode "PartitionPruner"))
  290.   (item (jcode "SamplePruner"))
  291.   (item (jcode "PredicatePushDown"))
  292.   (item (jcode "GroupByOptimizer"))
  293.   (item (jcode "MapJoinProcessor"))
  294.   (item (jcode "JoinReorder"))
  295.   (item "..."))

  296. (slide ; physical plan generation ("alse use" typo fixed in the last subitem)
  297.   #:title compiler-task/plan
  298.   (item (jcode "genMapRedWorks()"))
  299.   (subitem "take" (jcode "QB") "as input")
  300.   (subitem "gen task DAG")
  301.   (subitem "also uses optimization framework"))

  302. (outline 'runtime) ; outline call for the 'runtime section

  303. (slide ; runtime entities: tasks, operators, expressions (grammar fixed in the text)
  304.   #:title (title/sub runtime-overview "Entities")
  305.   (item "Tasks")
  306.   (subitem "may consist of operators DAG")
  307.   (subitem (jcode "DDLTask") (jcode "MapRedTask"))

  308.   (item "Operators")
  309.   (subitem "may contain expressions")
  310.   (subitem (jcode "FilterOperator") (jcode "SelectOperator"))

  311.   (item "Expressions")
  312.   (subitem "expression tree for specific computation")
  313.   (subitem (jcode "ConstantEvaluator") ; class names go through jcode like the rows above
  314.            (jcode "GenericFunctionEvaluator")))

  315. (slide ; tasks: naming pairing, base class, factory, and a framed table of examples
  316.   #:title runtime-task
  317.   (item (jcode "exec/XXXTask") "<-->" (jcode "plan/XXXWork"))
  318.   (item "Base Class:" (jcode "Task"))
  319.   (item "All Tasks from" (jcode "TaskFactory"))
  320.   (frame
  321.     (inset
  322.       (table 2 ; two columns; the first two picts form the header row
  323.              (list (bt "Task") (bt "Describe")
  324.                    (jcode "MapRedTask") (t "map or reduce job")
  325.                    (jcode "StatsTask") (t "analyze a table")
  326.                    (jcode "MoveTask") (t "hdfs dir/file move")
  327.                    (jcode "ConditionalTask") (t "conditional execution")
  328.                    (jcode "...") (t "..."))
  329.              lc-superimpose
  330.              cc-superimpose
  331.              (* 3/2 gap-size)
  332.              gap-size)
  333.       gap-size)))

  334. (slide ; operators: naming pairing, base class, factory, examples ("filte" typo fixed)
  335.   #:title runtime-operator
  336.   (item (jcode "exec/XXXOperator") "<-->" (jcode "plan/XXXDesc"))
  337.   (item "Base Class:" (jcode "Operator"))
  338.   (item "All Operators from" (jcode "OperatorFactory"))
  339.   (frame
  340.     (inset
  341.       (table 2 ; two columns; the first two picts form the header row
  342.              (list (bt "Operator") (bt "Describe")
  343.                    (jcode "FilterOperator") (t "filter rows")
  344.                    (jcode "SelectOperator") (t "projection")
  345.                    (jcode "ReduceSinkOperator") (t "output to reducer")
  346.                    (jcode "FileSinkOperator") (t "output to file")
  347.                    (jcode "...") (t "..."))
  348.              lc-superimpose
  349.              cc-superimpose
  350.              (* 3/2 gap-size)
  351.              gap-size)
  352.       gap-size)))

  353. (slide ; operator callbacks, one item per method
  354.   #:title (title/sub runtime-operator "Interfaces")
  355.   (item (jcode "initialize()") "call back, called only once")
  356.   (item (jcode "process()") "call back, called for each row")
  357.   (item (jcode "close()") "call back, called only once")
  358.   (item (jcode "startGroup()") "call back, called on new group")
  359.   (item (jcode "endGroup()") "call back, called on current group end")
  360.   (item (jcode "isDone()") "call back"))

  361. (slide ; expression evaluators; the factory line now says "Evaluators", not "Operators"
  362.   #:title runtime-expression
  363.   (item (jcode "exec/ExprXXXEvaluator") "<-->" (jcode "plan/ExprXXXDesc"))
  364.   (item "Base Class:" (jcode "ExprNodeEvaluator"))
  365.   (item "All Evaluators from" (jcode "ExprNodeEvaluatorFactory"))
  366.   (frame
  367.     (inset
  368.       (table 2 ; two columns; the first two picts form the header row
  369.              (list (bt "ExprEvaluator") (bt "Describe")
  370.                    (jcode "...Constant...") (t "return a constant value")
  371.                    (jcode "...Column...") (t "extract some column")
  372.                    (jcode "...GenericFunc...") (t "function call")
  373.                    (jcode "...Field...") (t "access struct field")
  374.                    (jcode "...Null...") (t "mark `void'"))
  375.              lc-superimpose
  376.              cc-superimpose
  377.              (* 3/2 gap-size)
  378.              gap-size)
  379.       gap-size)))

  380. (slide ; the two evaluator method signatures
  381.   #:title (title/sub runtime-expression "Interfaces")
  382.   (item (jcode "ObjectInspector initialize(ObjectInspector)"))
  383.   (item (jcode "Object evaluate(Object)")))

  384. (outline 'object-model) ; outline call for the 'object-model section

  385. (slide ; object model overview: location, decoupling, multi-format support
  386.   #:title (title/sub object-model "Overview")
  387.   (item "Resident in `serde' Directory")
  388.   (subitem "other than `ql' directory")

  389.   (item "De-Couple From Query Engine")
  390.   (subitem "de-couple from compiler")
  391.   (subitem "de-couple from runtime")

  392.   (item "Support Multi-Format")
  393.   (subitem "multiple on-disk format")
  394.   (subitem "multiple in memory format"))

  395. (slide ; on-disk formats, part 1: reasons for supporting several
  396.   #:title (title/sub object-model "On Disk Format 1")
  397.   (item "Hive Support Multiple On Disk Format")
  398.   (subitem "ease of use")
  399.   (subitem "ease of integration")
  400.   (subitem "flexibility, better trade off between space/performance"))

  401. (slide ; on-disk formats, part 2: file formats and row formats
  402.   #:title (title/sub object-model "On Disk Format 2")
  403.   (item "File Format")
  404.   (subitem "row based:" (jcode "TextFile"))
  405.   (subitem "column based:" (jcode "RCFile"))
  406.   (subitem "block based:" (jcode "SequenceFile"))

  407.   (item "Row Format")
  408.   (subitem "text based:" (jcode "TextFile"))
  409.   (subitem "binary based")
  410.   (subitem "customized"))

  411. (slide ; in-memory formats, part 1: motivation and the delegation idea ("Memeory" fixed)
  412.   #:title (title/sub object-model "In Memory Format 1")
  413.   (item "Hive Support Multiple In Memory Format")
  414.   (subitem "ease of integration")
  415.   (subitem "different on disk format + efficient loading")

  416.   (item "Key Idea: Delegation")
  417.   (subitem "data: Object")
  418.   (subitem "data access: ObjectInspector"))

  419. (slide ; in-memory formats, part 2: inspector interfaces and two pairing examples
  420.   #:title (title/sub object-model "In Memory Format 2")
  421.   (item "Base Interfaces")
  422.   (subitem (jcode "ObjectInspector"))
  423.   (subitem (jcode "PrimitiveObjectInspector")
  424.            (jcode "StructObjectInspector")
  425.            (jcode "MapObjectInspector")
  426.            (jcode "ListObjectInspector")
  427.            (jcode "Union(Struct)ObjectInspector"))

  428.   (item "Example")
  429.   (subitem (jcode "Integer") "+" (jcode "JavaIntegerObjectInspector"))
  430.   (subitem (jcode "IntWritable") "+"
  431.            (jcode "WritableIntegerObjectInspector")))

  432. (outline 'ideal-diff) ; outline call for the 'ideal-diff section

  433. (slide ; differences, commonalities and problems ("expection" spelling fixed)
  434.   #:title ideal-diff
  435.   (item "Differences")
  436.   (subitem "hive --> DDBMS")
  437.   (subitem "expectation --> language")

  438.   (item "Commons")
  439.   (subitem "a SQL-like sub-language")
  440.   (subitem "compiler and runtime")

  441.   (item "Problem")
  442.   (subitem "metastore")
  443.   (subitem "name binding & abstraction"))

  444. (outline 'references) ; outline call for the 'references section

  445. (slide ; closing slide: reference list and a thank-you line
  446.   #:title references
  447.   (item "https://cwiki.apache.org/" ; URL is given as two strings within one item
  448.         "confluence/display/Hive/Presentations")
  449.   (item "source code 0.7.1")
  450.   (blank)
  451.   (text "Thank you!" (current-main-font) title-font-size))

阅读(2458) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~