{"id":703,"date":"2023-11-24T09:11:26","date_gmt":"2023-11-24T00:11:26","guid":{"rendered":"https:\/\/skanto.co.kr\/?p=703"},"modified":"2023-12-23T11:59:21","modified_gmt":"2023-12-23T02:59:21","slug":"transformer","status":"publish","type":"post","link":"https:\/\/skanto.co.kr\/?p=703","title":{"rendered":"Transformer"},"content":{"rendered":"\n<p>\ucc38\uace0: <a href=\"http:\/\/jalammar.github.io\/illustrated-transformer\/\">The illustrated Transformer<\/a><\/p>\n\n\n\n<p>GPT\ub294 <strong>G<\/strong>enerative <strong>P<\/strong>re-trained <strong>T<\/strong>ransformer\uc758 \uc57d\uc790\ub77c\uace0 \uc54c\uace0 \uc788\ub2e4. \uc5ec\uae30\uc11c \uac00\uc7a5 \uc911\uc694\ud55c \uc5ed\ud560\uc744 \ud558\ub294 \uac83\uc774 Transformer\uc77c \uac83\uc774\ub2e4. \uadf8\ub807\ub2e4\uba74 Transformer\uac00 \uc5b4\ub5a4 \uae30\ub2a5\uc744 \ud558\uae30\uc5d0 \uac00\uc7a5 \uc911\uc694\ud55c\uc9c0 \uad81\uae08\uc99d\uc744 \uac00\uc9c0\uc9c0 \uc54a\uc744 \uc218 \uc5c6\ub2e4. \ub17c\ubb38\uc758 \ud3ec\ud568\ud574\uc11c \uc5ec\ub7ec \uc790\ub8cc\ub97c \uc0b4\ud3b4\ubd10\ub3c4 \uae00\uc790\ub9cc \ubcf4\uc774\uc9c0 \ubb38\ub9e5\uc774 \ubcf4\uc774\uc9c0 \uc54a\uc558\ub294\ub370 <a href=\"http:\/\/jalammar.github.io\/illustrated-transformer\/\">\uc774 \ubb38\uc11c<\/a>\ub97c \ubcf4\uace0\uc11c \ubb34\ub985\uc744 \ud0c1 \uce58\uac8c \ub418\uc5c8\ub2e4. \uc6b0\uc120 \uc26c\uc6b4 \uc124\uba85\uc744 \ud1b5\ud574 \uad81\uae08\uc99d\uc744 \ud574\uacb0\ud558\ub3c4\ub85d \ud574 \uc900 <a href=\"http:\/\/jalammar.github.io\/\">Jay Alammar<\/a>\uc5d0\uac8c \uac10\uc0ac\ub97c \ub4dc\ub9ac\uace0 \ub098\uc758 \uc5b8\uc5b4\ub85c \ub2e4\uc2dc \uc815\ub9ac\ud574 \ubcf4\uace0\uc790 \ud55c\ub2e4.<\/p>\n\n\n\n<p>\ub098\uc911\uc5d0 \uc790\uc138\ud558\uac8c \uc124\uba85\ud558\uaca0\uc9c0\ub9cc Transformer\ub294 \ub0b4\ubd80\uc801\uc73c\ub85c Attention\uc774\ub77c\ub294 \uac1c\ub150\uc744 \uc0ac\uc6a9\ud55c\ub2e4. 
\uc774\ub97c \uc0ac\uc6a9\ud568\uc73c\ub85c\uc368 \uadf8\ub3d9\uc548 \ubb38\uc81c\ub85c \uc9c0\uc801\ub410\ub358 \ub290\ub9b0 \ucc98\ub9ac \uc18d\ub3c4\ub97c \ud68d\uae30\uc801\uc73c\ub85c \uac1c\uc120\ud558\ub294 \uacc4\uae30\uac00 \ub418\uc5c8\ub2e4. \ube60\ub978 \uc18d\ub3c4\uc758 \uae30\uc220\uc801 \ubc30\uacbd\uc5d0\ub294 \ubcd1\ub82c\ucc98\ub9ac\uac00 \uc788\ub294\ub370 Google Neural Machine Translation Model\ubcf4\ub2e4 \uc131\ub2a5\uc744 \ub2a5\uac00\ud558\uba70 Google Cloud\uc5d0\uc11c\ub3c4 Transformer\ub97c \uae30\ubcf8 \ubaa8\ub378\ub85c \uc0ac\uc6a9\ud558\ub3c4\ub85d \ucd94\ucc9c\ud558\uace0 \uc788\ub2e4.<\/p>\n\n\n\n<p>Transformer\ub294 \uad6c\uae00\uc5d0\uc11c \ubc1c\ud45c\ud55c \ub17c\ubb38 &#8221;&nbsp;<a href=\"https:\/\/arxiv.org\/abs\/1706.03762\">Attention is All You Need<\/a>&#8220;\uc5d0\uc11c \ucc98\uc74c \uc18c\uac1c \ub418\uc5c8\uace0 \uc694\uc998, \uc774\ub97c \uad6c\ud604\ud55c \uad6c\ud604\uccb4(<a href=\"https:\/\/github.com\/tensorflow\/tensor2tensor\">Tensor2Tensor<\/a>, <a href=\"http:\/\/nlp.seas.harvard.edu\/2018\/04\/03\/attention.html\">PyTorch<\/a>\ub97c \ud65c\uc6a9\ud55c \uad6c\ud604 \ub4f1)\ub97c \uc27d\uac8c \ucc3e\uc544 \ubcfc \uc218 \uc788\ub2e4.<\/p>\n\n\n\n<p>\uadf8\ub7fc \uc9c0\uae08\ubd80\ud130 Transformer\uc758 \uac1c\ub150\uc744 \uc774\ud574\ud558\uae30 \uc27d\ub3c4\ub85d \ubca0\uc77c\uc744 \ud558\ub098\uc529 \ubc97\uaca8\ubcf4\ub3c4\ub85d \ud558\uaca0\ub2e4.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">A High-Level Look<\/h2>\n\n\n\n<p>Transformer\ub97c \uba87 \uac1c\uc758 \ubc15\uc2a4\ub85c \uc2ec\ud50c\ud558\uac8c \uadf8\ub824\ubcf4\uba74 \uc544\ub798\uc640 \uac19\ub2e4. 
\uc608\ub97c \ub4e4\uc5b4 \ud504\ub791\uc2a4\uc5b4\ub97c \uc601\uc5b4\ub85c \ubc88\uc5ed\ud574 \uc8fc\ub294 \ubc88\uc5ed \ud504\ub85c\uadf8\ub7a8\uc744 \uc0dd\uac01\ud574 \ubcfc \uc218 \uc788\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/the_transformer_3.png\" alt=\"\" style=\"width:629px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>\uc911\uac04 \ubc15\uc2a4 <a href=\"https:\/\/michaelbaystransformers.fandom.com\/wiki\/Optimus_Prime\">Optimus Prime<\/a>(Transformer ^^)\uc758 \ub69c\uaed1\uc744 \uc5f4\uc5b4\ubcf4\uba74 \uc544\ub798 \uadf8\ub9bc\ucc98\ub7fc \uc778\ucf54\ub529\uacfc \ub514\ucf54\ub529 \ucef4\ud3ec\ub10c\ud2b8 \uadf8\ub9ac\uace0 \uc774\ub4e4\uc0ac\uc774\uc758 \uc5f0\uacb0\ub85c \uad6c\uc131\ub41c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/The_transformer_encoders_decoders.png\" alt=\"\" style=\"width:516px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>\uc704 \uadf8\ub9bc\uc5d0\uc11c ENCODERS\ub85c \ud45c\ud604\ub41c \uc778\ucf54\ub529 \ucef4\ud3ec\ub10c\ud2b8 \ub0b4\ubd80\ub294 \uc544\ub798 \uadf8\ub9bc\ucc98\ub7fc \uc5ec\ub7ec \uac1c\uc758 Encoder\ub97c \uc2a4\ud0dd\uc73c\ub85c \uc313\uc544 \uc62c\ub9b0 \uad6c\uc870\ub97c \uac00\uc9c4\ub2e4.(\uc544\ub798 \uadf8\ub9bc\uc740 6\uac1c\ub97c \ud45c\ud604\ud588\uc73c\ub098 \ud2b9\ubcc4\ud55c \uc758\ubbf8\uac00 \uc788\ub294 \uac83\uc740 \uc544\ub2c8\uba70 \ub2e4\ub974\uac8c \uc124\uc815 \uac00\ub2a5\ud558\ub2e4) \ub514\ucf54\ub529 \ucef4\ud3ec\ub10c\ud2b8\ub3c4 \ub3d9\uc77c\ud558\uac8c Encoder\uc640 \uac19\uc740 \uac1c\uc218\uc758 Decoder\ub97c \uc2a4\ud0dd\uc73c\ub85c \uc313\uc740 \uad6c\uc870\ub97c \uac00\uc9c4\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" 
src=\"http:\/\/jalammar.github.io\/images\/t\/The_transformer_encoder_decoder_stack.png\" alt=\"\" style=\"width:519px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>Encoder\uc758 \uad6c\uc870\ub294 \ubaa8\ub450 \ub3d9\uc77c\ud558\uc9c0\ub9cc Encoder\ub4e4\uac04\uc5d0 \uc801\uc6a9\ub418\ub294 \uac00\uc911\uce58(Weight)\ub294 \uc11c\ub85c \ub2e4\ub974\ub2e4. \uc774\ub4e4 \uac01\uac01\uc758 Encoder\ub294 \ub0b4\ubd80\uc801\uc73c\ub85c \uc544\ub798\uc640 \uac19\uc774 sub layer\ub4e4\ub85c \uad6c\uc131\ub41c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/Transformer_encoder.png\" alt=\"\" style=\"width:517px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>Encoder\uc5d0 \uc785\ub825\ub418\ub294 input\uac12\uc740 \uba3c\uc800 Self-Attention Layer\ub97c \ud1b5\uacfc\ud55c\ub2e4.  \uc774 Layer\ub294 \uc8fc\uc5b4\uc9c4 \ubb38\uc7a5(sentence)\uc758 \ud2b9\uc815 \ub2e8\uc5b4\ub97c Encoding\ud560 \uc2dc\uc810\uc5d0 \uc774 \ub2e8\uc5b4\ub97c \uae30\uc900\uc73c\ub85c \ub2e4\ub978 \ub2e8\uc5b4\uc640\uc758 \uad00\uacc4\uac00 \uc5b4\ub5a4\uc9c0\ub97c \ud30c\uc545\ud558\ub294 \uc5ed\ud560\uc744 \ud55c\ub2e4. (\uc774\ud6c4\uc5d0 Self-Encoder\uc5d0 \ub300\ud574 \uc790\uc138\ud558\uac8c \uc54c\uc544\ubcfc \uac83\uc784)<\/p>\n\n\n\n<p>Self-Attention\uc73c\ub85c\ubd80\ud130 \ucd9c\ub825\ub418\ub294 \uacb0\uacfc(Output)\ub294 Feed Forward Neural Network\uc758 \uc785\ub825(Input)\uac12\uc73c\ub85c \ub4e4\uc5b4\uac04\ub2e4. \uc815\ud655\ud788 \uac01\uac01 \ub2e8\uc5b4\ub4e4\uc5d0 \ub3c5\ub9bd\uc801\uc73c\ub85c \ub3d9\uc77c\ud55c Feed Forward Neural Network\uc774 \uc801\uc6a9\ub41c\ub2e4. Decoder\ub3c4 \uc774\uc640 \ub3d9\uc77c\ud558\uac8c 2\uac1c\uc758 Layer\ub97c \uac00\uc9c4\ub2e4. 
\uadf8\ub7ec\ub098 \ub450 Layer\uc0ac\uc774\uc5d0 Attention Layer\ub77c\ub294 \uac83\uc774 \ub354 \uc788\uc73c\uba70 Decoding\uacfc\uc815\uc5d0\uc11c Decoder\uac00 \uc785\ub825\uc73c\ub85c \uc8fc\uc5b4\uc9c4 \ubb38\uc7a5\uc5d0\uc11c \uc5f0\uad00\uc131\uc774 \ub192\uc740 \ub2e8\uc5b4\uc5d0 Focusing\ud558\ub3c4\ub85d \ub3c4\uc640\uc900\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/Transformer_decoder.png\" alt=\"\" style=\"width:537px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">Bringing The Tensors Into The Picture<\/h2>\n\n\n\n<p>Transformer\uc758 \uc8fc\uc694 \ucef4\ud3ec\ub10c\ud2b8\uc5d0 \ub300\ud574\uc11c \uc0b4\ud3b4\ubd24\ub2e4\uba74 \uc774\uc81c \uc785\ub825\uac12\uc774 \uc774\ub4e4 \ucef4\ud3ec\ub10c\ud2b8 \uc0ac\uc774\ub97c \uc774\ub3d9\ud558\uba74\uc11c \uc5b4\ub5bb\uac8c \ucd9c\ub825\uac12\uc73c\ub85c \ubcc0\ud574\uac00\ub294\uc9c0\uc5d0 \ub300\ud574 \uc54c\uc544 \ubcf4\uc790.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/embeddings.png\" alt=\"\" style=\"width:640px;height:auto\"\/><figcaption class=\"wp-element-caption\">\uac01 \ub2e8\uc5b4\ub294 512 \ud06c\uae30\uc758 \ubca1\ud130\ub85c embedding\ub418\uc5c8\ub2e4. \uc5ec\uae30\uc11c\ub294 \uac04\ub2e8\ud558\uac8c \ubc15\uc2a4\ub85c \ud45c\ud604\ud55c\ub2e4.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>Embedding \uacfc\uc815\uc740 Encoder \uc2a4\ud0dd \uc911 \uac00\uc7a5 \ud558\ub2e8\uc5d0 \uc704\uce58\ud55c Encoder\uc5d0\uc11c\ub9cc \uc77c\uc5b4\ub09c\ub2e4. \ud558\uc9c0\ub9cc \ub2e4\ub978 Encoder\ub4e4\uacfc \ub3d9\uc77c\ud558\uac8c \uc801\uc6a9\ub418\ub294 \ubd80\ubd84\uc744 \ucd94\uc0c1\ud654 \ud574\ubcf4\uc790\uba74 512 \ud06c\uae30\uc758 Vector\ub97c \uc785\ub825\uac12\uc73c\ub85c \ubc1b\ub294\ub2e4\ub294 \uac83\uc774\ub2e4. 
\ub2e8\uc9c0 \ucc28\uc774\uac00 \uc788\ub2e4\uba74 \uac00\uc7a5 \ud558\ub2e8\uc758 Encoder\ub294 Embedding\uc744 \uc785\ub825 \uac12\uc73c\ub85c \ubc1b\uc9c0\ub9cc \uadf8 \uc717\ub2e8\uc758 Encoder\ub4e4\uc740 \ubc14\ub85c \ubc11\uc5d0 \uc788\ub294 Encoder\uc758 \ucd9c\ub825\uc744 \uc785\ub825\uac12\uc73c\ub85c \ubc1b\ub294\ub2e4\ub294 \uac83\uc774\ub2e4. \uc774 Vector\uc758 \ud06c\uae30\ub294 hyperparameter\ub85c \uc124\uc815\uc774 \uac00\ub2a5\ud558\uba70 \uc77c\ubc18\uc801\uc73c\ub85c Training Dataset \uc911\uc5d0\uc11c \uac00\uc7a5 \uae34 \ubb38\uc7a5\uc758 \uae38\uc774\ub97c \uac12\uc73c\ub85c \uc124\uc815\ud55c\ub2e4.<\/p>\n\n\n\n<p>\uc785\ub825 \ubb38\uc7a5\uc744 Embedding\ud558\uace0 \ub098\uba74 \uc774\ub4e4 \uac01\uac01\uc740 \uc544\ub798 \uadf8\ub9bc\uacfc \uac19\uc774 Encoder \ub0b4\ubd80\uc758 \ub450 \uac1c Layer\ub97c \ud1b5\uacfc\ud55c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/encoder_with_tensors.png\" alt=\"\" style=\"width:633px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>\uc5ec\uae30\uc11c Transformer\uac00 \uac00\uc9c4 \uc18d\uc131 \uc911 \ud558\ub098\ub97c \ubcfc \uc218 \uc788\ub294\ub370 \uac01 \uc704\uce58\ubcc4 \ub2e8\uc5b4\ub294 \uac1c\ubcc4\uc801\uc73c\ub85c Encoder\ub97c \ud1b5\uacfc\ud55c\ub2e4\ub294 \uac83\uc774\ub2e4. Self-attention layer \ub0b4\ubd80\uc5d0\uc11c \uc774\ub4e4\uc758 path\ub294 \uc11c\ub85c \uc758\uc874\uc801\uc774\uc9c0\ub9cc Feed-forward layer\uc5d0\uc11c\ub294 \uc11c\ub85c\uac04\uc758 \uc758\uc874\uc131\uc774 \uc874\uc7ac\ud558\uc9c0 \uc54a\ub294\ub2e4. 
\ub530\ub77c\uc11c, Feed-forward layer\ub97c \ud1b5\uacfc\ud560 \ub54c\ub294 \uac1c\ubcc4 path\ub294 \ubcd1\ub82c\ub85c \uc2e4\ud589 \uac00\ub2a5\ud558\ub2e4.<\/p>\n\n\n\n<p>\uc774\uc81c \uc9e7\uc740 \ubb38\uc7a5\uc744 \uc608\ub85c \ub4e4\uc5b4 Encoder\uc758 \ub0b4\uc758 \uac01\uac01  sub Layer\uc5d0\uc11c \uc5b4\ub5a4 \uc77c\uc774 \ubc1c\uc0dd\ud558\ub294\uc9c0 \uc0b4\ud3b4 \ubcf4\uc790.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Now We\u2019re Encoding!<\/h2>\n\n\n\n<p>\uc55e\uc5d0\uc11c \uc124\uba85\ud588\ub4ef\uc774, Encoder\ub294 Vector(Embedding)\ud615\ud0dc\ub85c \uc785\ub825\uac12\uc744 \ubc1b\ub294\ub2e4. \uc774 Vector\ub294 Encoder \ub0b4\ubd80\uc5d0\uc11c self-attention layer\ub97c \ud1b5\uacfc \ud55c \ud6c4 \ub2e4\uc2dc feed-forward layer\ub85c \ub4e4\uc5b4\uac04\ub2e4. \uadf8\ub7f0 \ub2e4\uc74c \uadf8 \uacb0\uacfc(Output)\ub294 \ub2e4\uc74c \ub2e8\uacc4 Encoder\uc758 \uc785\ub825\uac12\uc73c\ub85c \ub4e4\uc5b4\uac04\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/encoder_with_tensors_2.png\" alt=\"\" style=\"width:697px;height:auto\"\/><figcaption class=\"wp-element-caption\">The word at each position passes through a self-attention process. Then, they each pass through a feed-forward neural network &#8212; the exact same network with each vector flowing through it separately.<\/figcaption><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">Self-Attention at a High Level<\/h2>\n\n\n\n<p>Self-Attention\uc774\ub77c\ub294 \uc6a9\uc5b4\uac00 \uc0dd\uacbd\ud560 \uc218 \uc788\ub2e4. \uc801\uc5b4\ub3c4 \uad6c\uae00\uc5d0\uc11c &#8220;Attention is All You Need paper&#8221;\ub77c\ub294 \ub17c\ubb38\uc744 \ubc1c\ud45c\ud558\uae30 \uc804\uae4c\uc9c0\ub294 \ub300\ubd80\ubd84 \uadf8 \uac1c\ub150\uc744 \ubab0\ub790\ub2e4. 
\uadf8\ub7fc \uc774\uc81c \uc774\uac83\uc774 \uc5b4\ub5bb\uac8c \ub3d9\uc791\ud558\ub294\uc9c0 \uc54c\uc544\ubcf4\uc790.<\/p>\n\n\n\n<p>\ub2e4\uc74c \ubb38\uc7a5\uc744 \ubc88\uc5ed\ud558\ub824\ub294 \uc785\ub825\uac12\uc774\ub77c\uace0 \ud574\ubcf4\uc790.<\/p>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p>\u201d<code>The animal didn't cross the street because it was too tired<\/code>\u201d<\/p>\n<\/blockquote>\n\n\n\n<p>\uc774 \ubb38\uc7a5\uc5d0\uc11c &#8220;it&#8221;\uc740 \ubb34\uc5c7\uc744 \uac00\ub9ac\ud0a4\ub294\uac00? it\uc774 street\ub97c \uac00\ub9ac\ud0a4\ub294\uac00 \uc544\ub2c8\uba74 animal\uc744 \uac00\ub9ac\ud0a4\ub294\uac00? \uc0ac\ub78c\ub4e4\uc5d0\uac8c\ub294 \ub2e8\uc21c\ud574 \ubcf4\uc774\ub294 \uc9c8\ubb38\uc774\uc9c0\ub9cc \uc54c\uace0\ub9ac\uc998\uc5d0\uac8c\ub294 \uadf8\ub807\uc9c0 \uc54a\ub2e4.<\/p>\n\n\n\n<p>&#8220;it&#8221;\uc774\ub77c\ub294 \ub2e8\uc5b4\ub97c \ubaa8\ub378\uc774 \ucc98\ub9ac\ud558\uace0 \ub098\uba74 self-attention\uc740 &#8220;it&#8221;\uc774 &#8220;animal&#8221;\uc744 \uac00\ub9ac\ud0a4\ub3c4\ub85d(\uad00\ub828\uc131) \ud55c\ub2e4.<\/p>\n\n\n\n<p><span style=\"text-decoration: underline;\">\uc785\ub825\ub41c \ubb38\uc7a5\uc5d0\uc11c \ubaa8\ub378\uc774 \ud2b9\uc815 \uc704\uce58\uc5d0 \uc788\ub294 \ub2e8\uc5b4\ub97c \ud558\ub098\uc529 \ucc98\ub9ac\ud574 \ub098\uac08 \ub54c self attention\uc740 \ub2e8\uc5b4 \uc21c\uc11c\uc5d0\uc11c \ub2e4\ub978 \uc704\uce58\uc5d0 \uc788\ub294 \ub2e8\uc5b4\ub4e4\uc744 \uad00\ucc30\ud568\uc73c\ub85c\uc368 \ub354 \ub098\uc740 \uc778\ucf54\ub529\uc744 \uc704\ud55c \ub2e8\uc11c\ub97c \ucc3e\uc544\ub0b8\ub2e4.<\/span> <span style=\"text-decoration: underline;\"><strong>Self-attention\uc740 Transformer\uac00 \uc0ac\uc6a9\ud558\ub294 \ubc29\ubc95\uc73c\ub85c \ud604\uc7ac \ucc98\ub9ac\uc911\uc778 \ub2e8\uc5b4\uc640 \ub2e4\ub978 \ub2e8\uc5b4\ub4e4\uacfc\uc758 \uc5f0\uad00\uc131\uc5d0 \ub300\ud55c &#8220;\uc774\ud574\ub3c4(understanding)&#8221;\ub97c 
\ud604\uc7ac \ucc98\ub9ac \uc911\uc778 \ub2e8\uc5b4 \ub179\uc5ec \ub123\ub294 \ubc29\ubc95\uc774\ub77c \ud560 \uc218 \uc788\ub2e4.<\/strong><\/span> (RNN\uc5d0\uc11c hidden state\ub97c \uc720\uc9c0\ud558\ub294 \uac83\uacfc \uc720\uc0ac)<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_self-attention_visualization.png\" alt=\"\"\/><figcaption class=\"wp-element-caption\">As we are encoding the word &#8220;it&#8221; in encoder #5 (the top encoder in the stack), part of the attention mechanism was focusing on &#8220;The Animal&#8221;, and baked a part of its representation into the encoding of &#8220;it&#8221;.<\/figcaption><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">Self-Attention in Detail<\/h2>\n\n\n\n<p>Vector\ub97c \uc774\uc6a9\ud574\uc11c self-attention\uc744 \uc5b4\ub5bb\uac8c \uacc4\uc0b0\ud558\ub294\uc9c0 \uc54c\uc544\ubcf4\uc790. \uadf8 \ub2e4\uc74c\uc73c\ub85c \uc774 \ub9e4\ud2b8\ub9ad\uc2a4\ub97c \ud65c\uc6a9\ud574\uc11c \uc2e4\uc81c \uc5b4\ub5bb\uac8c \uad6c\ud604\ub418\ub294\uc9c0\ub3c4 \uc0b4\ud3b4\ubcf4\uc790<\/p>\n\n\n\n<p><strong>\uccab \ubc88\uc9f8 \ub2e8\uacc4<\/strong>\ub294 \uc785\ub825\uac12\uc73c\ub85c \uc8fc\uc5b4\uc9c0\ub294 \uac01 \ub2e8\uc5b4\ubcc4 Embedding\uc73c\ub85c\ubd80\ud130 3\uac1c\uc758 Vector\ub97c \ub9cc\ub4e0\ub2e4. \uc989, \uac01 \ub2e8\uc5b4\ubcc4\ub85c Query Vector, Key Vector, Value Vector\ub97c \uac01\uac01 \ub3c4\ucd9c\ud55c\ub2e4. \uc774\ub4e4 Vector\ub294 Training\uc744 \ud1b5\ud574 \uc5bb\uc740 3\uac1c\uc758 \uac00\uc911\uce58(weight) \ub9e4\ud2b8\ub9ad\uc2a4\ub97c Embedding\uacfc \uac01\uac01 \uacf1\ud574\uc11c \uad6c\ud55c\ub2e4.<\/p>\n\n\n\n<p>\uc774\ub807\uac8c \uc0dd\uc131\ub41c Vector\uc758 \ucc28\uc6d0(dimension)\uc740 Embedding Vector\ubcf4\ub2e4 \ub0ae\ub2e4\ub294 \uac83\uc5d0 \uc8fc\ubaa9\ud558\ub2e4. 
\uc0dd\uc131\ub41c Vector\uc758 \ucc28\uc6d0\uc740 64\uc774\uc9c0\ub9cc Embedding\uacfc \uc785\/\ucd9c\ub825 Vector\ub4e4\uc740 512 \ucc28\uc6d0(dimension)\uc744 \uac00\uc9c4\ub2e4. \ucc28\uc6d0\uc774 \ubc18\ub4dc\uc2dc \ub0ae\uc744 \ud544\uc694\ub294 \uc5c6\uc73c\uba70 \ub2e8\uc9c0 multiheaded attention \uc5f0\uc0b0\uc774 \uc77c\uc815\ud558\ub3c4\ub85d \ud558\uae30 \uc704\ud574 \uc544\ud0a4\ud14d\ucc98\uc801\uc73c\ub85c \uc120\ud0dd\ud55c \uac12\uc774\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_self_attention_vectors.png\" alt=\"\" style=\"width:651px;height:auto\"\/><figcaption class=\"wp-element-caption\">Multiplying&nbsp;x1&nbsp;by the&nbsp;WQ&nbsp;weight matrix produces&nbsp;q1, the &#8220;query&#8221; vector associated with that word. We end up creating a &#8220;query&#8221;, a &#8220;key&#8221;, and a &#8220;value&#8221; projection of each word in the input sentence.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\uadf8\ub807\ub2e4\uba74 &#8220;query&#8221;, &#8220;key&#8221;, &#8220;value&#8221; vector\ub294 \uac01\uac01 \ubb34\uc5c7\uc77c\uae4c? attention \uc5f0\uc0b0\uacfc attention\uc744 \uace0\ub824\ud568\uc5d0 \uc788\uc5b4 \uc720\uc6a9\ud55c \ucd94\uc0c1\uc801\uc778 \uac1c\ub150\uc774\ub77c \uc0dd\uac01\ud558\uba74 \ub41c\ub2e4. \uc608\ub97c \ub4e4\uc5b4 \uc124\uba85\ud558\uc790\uba74 key\/value\/query \uac1c\ub150\uc740 \uc815\ubcf4 \ucd94\ucd9c \uc2dc\uc2a4\ud15c\uacfc \uc720\uc0ac\ud558\ub2e4. 
Youtube\uc5d0\uc11c \ube44\ub514\uc624\ub97c \uac80\uc0c9\ud560 \ub54c \uac80\uc0c9\uc5d4\uc9c4\uc740 query(\uac80\uc0c9\uc5b4)\ub97c \ubc14\ud0d5\uc73c\ub85c \ub370\uc774\ud130\ubca0\uc774\uc2a4\uc5d0 \uc788\ub294 \uad00\ub828 \ud6c4\ubcf4 \ube44\ub514\uc624\ub4e4\uc758  key(\ube44\ub514\uc624\uc758 \ud0c0\uc774\ud2c0, \uc124\uba85 \ub4f1)\uc744 \ub9e4\ud551\uc2dc\ud0a8\ub2e4\uc74c \uac00\uc7a5 \uc798 \ub9e4\uce6d\ub418\ub294 \ube44\ub514\uc624\ub4e4(values)\ub97c \uacb0\uacfc\ub85c \ubcf4\uc5ec\uc900\ub2e4.  \uc544\ub798\uc758 \uc124\uba85\uc744 \ubcf4\uba74 \uc774\ub4e4 \uac12(vector)\uc758 \uc5ed\ud560\uc774 \ubb34\uc5c7\uc778\uc9c0 \uc27d\uac8c \uc774\ud574\ud560 \uc218 \uc788\uc744 \uac83\uc774\ub2e4.<\/p>\n\n\n\n<p><strong>\ub450 \ubc88\uc9f8 \ub2e8\uacc4<\/strong>\ub294 score(attention score)\ub97c \uacc4\uc0b0\ud558\ub294 \uac83\uc774\ub2e4. \uc608\uc81c\uc5d0 \uc788\ub294 \uccab\ubc88\uc9f8 \ub2e8\uc5b4(Thinking)\uc5d0 \ub300\ud574 self-attention\uc744 \uacc4\uc0b0\ud55c\ub2e4\uace0 \ud574\ubcf4\uc790. \uc774 \ub2e8\uc5b4(Thinking)\uc744 \uae30\uc900\uc73c\ub85c \ub2e4\ub978 \ub2e8\uc5b4\ub4e4\uc5d0 \ub300\ud55c attention score\ub97c \uacc4\uc0b0\ud560 \ud544\uc694\uac00 \uc788\ub2e4. 
<span style=\"text-decoration: underline;\">\uc774 score\ub294 \uc785\ub825 \ubb38\uc7a5\uc5d0\uc11c \ud2b9\uc815 \uc704\uce58\uc758 \ub2e8\uc5b4\ub97c Encoding\ud560 \ub54c \ub2e4\ub978 \ub2e8\uc5b4\ub4e4\uc5d0 \ub300\ud574 \uc5b4\ub290 \uc815\ub3c4\uc758 \uad00\uc2ec(focus)\ub97c \ub450\uc5b4\uc57c \ud558\ub294\uc9c0\ub97c \uacb0\uc815<\/span>\ud55c\ub2e4.<\/p>\n\n\n\n<p>\uc774 score\ub294 \ud574\ub2f9 \ub2e8\uc5b4\uc758 query vector\uc640 key vector\ub97c dot product\ub85c \uacc4\uc0b0\ud558\uba70 #1\uc704\uce58\uc5d0 \uc788\ub294 \ub2e8\uc5b4\uc758 self-attention\uc744 \ucc98\ub9ac\ud560 \uacbd\uc6b0 score\ub294 q1\uacfc k1\uc758 dot product\uac00 \ub420 \uac83\uc774\uace0 \ub450\ubc88\uc9f8 score\ub294 q1\uacfc k2\uc758 dot product\uac00 \ub41c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_self_attention_score.png\" alt=\"\"\/><\/figure>\n<\/div>\n\n\n<p><strong>\uc138 \ubc88\uc9f8\uc640 \ub124 \ubc88\uc9f8 \ub2e8\uacc4<\/strong>\ub294  \uc774\ub807\uac8c \uacc4\uc0b0\ub41c score\ub97c 8\ub85c \ub098\ub208\ub2e4.(key vector\uc758 \ucc28\uc6d0 -64- \uc758 \uc81c\uacf1\uadfc. <span style=\"text-decoration: underline;\">\uc774\ub807\uac8c \ud558\uba74 \ub354\uc6b1 \uc548\uc815\uc801\uc778 \uae30\uc6b8\uae30\ub97c \ub9cc\ub4e4\uc5b4 \ub0bc \uc218 \uc788\ub2e4\uace0 \ud55c\ub2e4<\/span>. 
\ub2e4\ub978 \uac12\uc774 \uc788\uc744 \uc218 \uc788\uc9c0\ub9cc default\uac12\uc774\ub77c\uace0\ud568) \uadf8\ub7f0 \ub2e4\uc74c, \uadf8 \uacb0\uacfc\ub97c softmax \uc5f0\uc0b0\uc744\ud558\uba70, \uc774\ub807\uac8c \ud558\uba74 score\uac00 0 ~ 1 \uc0ac\uc774\uc758 \ud655\ub960\uc744 \uac16\ub3c4\ub85d normalize \ub41c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/self-attention_softmax.png\" alt=\"\" style=\"width:705px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>Softmax score\ub294 \ud604\uc7ac \ub2e8\uc5b4\ub97c \uae30\uc900\uc73c\ub85c \ub2e4\ub978 \ub2e8\uc5b4\ub4e4\uc774 \uc5bc\ub9c8\ub098 \ub9ce\uc740 \uad00\uc2ec\uc744 \uac00\uc9c0\ub294\uc9c0\ub97c \uacb0\uc815\ud55c\ub2e4. \ud604\uc7ac \uc704\uce58\uc758 \ub2e8\uc5b4\uac00 \uac00\uc7a5 \ub192\uc740 softmax score\ub97c \uac00\uc9c0\ub294 \uac83\uc744 \ubcfc \uc218 \uc788\ub2e4. \uadf8\ub7ec\ub098 \ub54c\ub85c \ud604\uc7ac \uc704\uce58\uc758 \ub2e8\uc5b4\uc640 \ub2e4\ub978 \ub2e8\uc5b4\uac00 \uc5b4\ub290\uc815\ub3c4 \uad00\ub828\uc131\uc774 \uc788\ub294\uc9c0 \ud655\uc778\ud558\ub294\ub370 \uc720\uc6a9\ud560 \uc218 \uc788\ub2e4.<\/p>\n\n\n\n<p><strong>\ub2e4\uc12f\ubc88\uc9f8 \ub2e8\uacc4<\/strong>\ub294 value vector\ub97c \uc55e\uc5d0\uc11c \uacc4\uc0b0\ud55c softmax score\uc640 \uacf1\ud55c\ub2e4. 
\uc774\ub807\uac8c \ud558\ub294 \uc774\uc720\ub294 <span style=\"text-decoration: underline;\">\uad00\uc2ec\uc744 \ub450\uace0\uc788\ub294 \ub2e8\uc5b4\ub294 \ubcc0\uacbd\uc5c6\uc774 \uc720\uc9c0\ub418\uac8c \ud558\ub3c4\ub85d \ud558\uace0, \uad00\ub828\uc131\uc774 \uc801\uc740 \ub2e8\uc5b4\ub294 \ubc30\uc81c\uc2dc\ud0a4\uae30 \uc704\ud574\uc11c\uc774\ub2e4.<\/span>(\uc608\ub97c \ub4e4\uba74 \uc544\uc8fc \uc791\uc740 \uac12(0.001)\uc744 \uacf1\ud558\uba74 \uadf8 value\ub294 \uc544\uc8fc \ubbf8\ubbf8\ud574 \uc9c4\ub2e4)<\/p>\n\n\n\n<p><strong>\uc5ec\uc12f\ubc88\uc9f8 \ub2e8\uacc4<\/strong>\ub294 \uc774\ub807\uac8c \uacf1\ud574\uc11c \uc5bb\uc740 value vector\ub97c \ubaa8\ub450 \ub354\ud55c\ub2e4. \uc774\ub807\uac8c \ud558\uba74 \ud604\uc7ac \uc704\uce58\uc5d0 \ud574\ub2f9\ud558\ub294 \ub2e8\uc5b4\uc5d0 \ub300\ud574 self-attention layer\uc758 \uacb0\uacfc\uac12\uc744 \uc5bb\uac8c \ub41c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/self-attention-output.png\" alt=\"\" style=\"width:673px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>\uc774\ub807\uac8c self-attention \uacc4\uc0b0\uc774 \uc885\ub8cc\ub418\uace0, \uacb0\uacfc vector\ub294 feed-forward neural network\uc73c\ub85c \uc804\ub2ec\ub41c\ub2e4. \uc2e4\uc81c \uad6c\ud604\ub2e8\uacc4\uc5d0\uc11c\ub294 \uc774 \uacc4\uc0b0\uc740 \ube60\ub978 \ucc98\ub9ac\ub97c \uc704\ud574 matrix\uc5f0\uc0b0\uc73c\ub85c \uc774\ub8e8\uc5b4\uc9c4\ub2e4. 
\uc9c0\uae08\uae4c\uc9c0 \ub2e8\uc5b4 \ub808\ubca8\uc5d0\uc11c \uc5b4\ub5bb\uac8c \uc5f0\uc0b0 \uc774\ub8e8\uc5b4\uc9c0\ub294\uc9c0 \uac1c\ub150\uc801\uc73c\ub85c \ubd24\uc73c\ub2c8 \uc2e4\uc81c \uc5b4\ub5bb\uac8c \uacc4\uc0b0\ub418\ub294\uc9c0 \uc54c\uc544\ubcf4\uc790.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">Matrix Calculation of Self-Attention<\/h2>\n\n\n\n<p><strong>\uccab \ubc88\uc9f8 \ub2e8\uacc4<\/strong>\ub294 Query, Key, Value matrix\ub97c \uacc4\uc0b0\ud558\ub294 \uac83\uc774\ub2e4. \uba3c\uc800 embedding\uc744 matrix X\ub85c \ubcc0\ud658\ud55c \ud6c4 \uc774 \uac12\uc744 \uac00\uc911\uce58 matrix(weighted matrix &#8211; training \uacfc\uc815\uc5d0\uc11c \uc5bb\uc740 \uac12)(WQ, WK, WV)\uc640 \uacf1\uc148\ud55c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/self-attention-matrix-calculation.png\" alt=\"\" style=\"width:485px;height:auto\"\/><figcaption class=\"wp-element-caption\">Every row in the&nbsp;X&nbsp;matrix corresponds to a word in the input sentence. We again see the difference in size of the embedding vector (512, or 4 boxes in the figure), and the q\/k\/v vectors (64, or 3 boxes in the figure)<\/figcaption><\/figure>\n<\/div>\n\n\n<p><strong>\ub9c8\uc9c0\ub9c9 \ub2e8\uacc4<\/strong>\ub85c, matrix\uac00 \ub9cc\ub4e4\uc5b4 \uc84c\uc73c\ubbc0\ub85c \uc704 \uc124\uba85\uc758 2\ubc88\uc9f8\ubd80\ud130 6\ubc88\uc9f8 \ub2e8\uacc4\ub294 \uc544\ub798\uc640 \uac19\uc774 \ud558\ub098\uc758 \uc2dd\uc73c\ub85c \uac04\ub7b5\ud558\uac8c \ud45c\ud604\ud560 \uc218 \uc788\ub2e4. 
<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/self-attention-matrix-calculation-2.png\" alt=\"\" style=\"width:619px;height:auto\"\/><figcaption class=\"wp-element-caption\">The self-attention calculation in matrix form<\/figcaption><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">The Beast With Many Heads<\/h2>\n\n\n\n<p>\uad6c\uae00\uc5d0\uc11c \ubc1c\ud45c\ud55c \ub17c\ubb38\uc740 Self-attention layer\uc5d0 &#8220;multi-headed&#8221; attention\uc774\ub77c\ub294 \uac1c\ub150\uc744 \ucd94\uac00\ud574\uc11c self-attention layer\ub97c \ub354 \uc815\uad50\ud558\uac8c \ub9cc\ub4e4\uc5c8\ub2e4. \uc774\ub294 \ub2e4\uc74c\uacfc \uac19\uc774 2\uac00\uc9c0 \uce21\uba74\uc5d0\uc11c attention layer\uc758 \uc131\ub2a5\uc744 \ud5a5\uc0c1\uc2dc\ucf30\ub2e4.<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>\ubb38\uc7a5(sentence)\uc5d0\uc11c \uc11c\ub85c \ub2e4\ub978 \uc704\uce58\uc758 \ub2e8\uc5b4\uc5d0 \uc9d1\uc911\ud560 \uc218 \uc788\ub3c4\ub85d \ud568\uc73c\ub85c\uc368 \ubaa8\ub378 \uc131\ub2a5\uc744 \ud655\uc7a5\uc2dc\ucf30\ub2e4. \uadf8\ub807\ub2e4. \uc704 \uc608\uc81c\uc5d0\uc11c z1\uc740 \ub2e4\ub978 encoding\uc758 \uac12\uc744 \uc77c\ubd80 \ud3ec\ud568\ud558\uc9c0\ub9cc \uc790\uae30 \uc790\uc2e0\uc774 \uac00\uc7a5 \ud070 \uc601\ud5a5\uc744 \ubbf8\uce5c\ub2e4. \uc608\ub97c \ub4e4\uc5b4 &#8220;The animal didn&#8217;t cross the street because it was too tired&#8221;\ub77c\ub294 \ubb38\uc7a5\uc744 \ubc88\uc5ed\ud560 \ub54c &#8220;it&#8221;\uc774 \uac00\ub9ac\ud0a4\ub294 \ub2e8\uc5b4\uac00 \ubb34\uc5c7\uc778\uc9c0 \uc54c \uc218 \uc788\ub2e4\uba74 \ub3c4\uc6c0\uc774 \ub420 \uac83\uc774\ub2e4.<\/li>\n\n\n\n<li>attention layer\uc5d0\uc11c \ub2e4\uc911\uc758 &#8220;representation subspace&#8221;\ub97c \uc81c\uacf5\ud55c\ub2e4. 
\uc774\ud6c4\uc5d0 \uc0b4\ud3b4 \ubcfc \uac83\uc774\uc9c0\ub9cc multi-headed attention\uc744 \ud1b5\ud574 \uc5ec\ub7ec\uac1c\uc758 Query\/Key\/Value \uac00\uc911\uce58 matrix(weighted matrices)\ub97c \uac00\uc9c8 \uc218 \uc788\ub2e4.(Transformer\ub294 8\uac1c\uc758 attention head\ub97c \uac00\uc9c0\uba70 \ub530\ub77c\uc11c 8\uac1c\uc758 \uac01\uac01 encoder\/decoder\ub97c \uac00\uc9c0\uac8c \ub41c\ub2e4.) \uc774\ub4e4 Query\/Key\/Value \uac00\uc911\uce58\ub4e4\uc740 \ucc98\uc74c random\ud558\uac8c initialize\ub418\uba70 training \ub41c \uc774\ud6c4 \uc785\ub825 embedding\uacfc\uc758 projection \uc5f0\uc0b0\uc744 \ud1b5\ud574  \uac01\uae30 \ub2e4\ub978 representation subspace\ub85c \ubcc0\ud658\ud558\uae30 \uc704\ud574 \uc0ac\uc6a9\ub41c\ub2e4.<\/li>\n<\/ol>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_attention_heads_qkv.png\" alt=\"\" style=\"width:737px;height:auto\"\/><figcaption class=\"wp-element-caption\">With multi-headed attention, we maintain separate Q\/K\/V weight matrices for each head resulting in different Q\/K\/V matrices. 
As we did before, we multiply X by the WQ\/WK\/WV matrices to produce Q\/K\/V matrices.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\uc704\uc5d0\uc11c \uc124\uba85\ud55c \uac83\ucc98\ub7fc \uc5ec\ub35f \ubc88\uc758  \uc11c\ub85c \ub2e4\ub978 \uac00\uc911\uce58 matrix\uc640 self-attention \uc5f0\uc0b0\uc744 \ud558\uba74 \uc544\ub798\uc640 \uac19\uc774 \uc11c\ub85c \ub2e4\ub978 z matrix\ub97c \uc5bb\uac8c \ub41c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_attention_heads_z.png\" alt=\"\" style=\"width:655px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>Feed-forward layer\ub294 8\uac1c\uc758 \uac1c\ubcc4 matrix\uac00 \uc544\ub2c8\ub77c \ud558\ub098\uc758 matrix\ub97c \uc785\ub825\uc73c\ub85c \ubc1b\uc73c\ubbc0\ub85c \ud558\ub098\uc758 matrix\ub85c \uc904\uc774\ub294 \ubc29\ubc95\uc774 \ud544\uc694\ud558\ub2e4. \uc5b4\ub5bb\uac8c \ud558\uba74 \ub420\uae4c? 8\uac1c\uc758 matrix\ub97c \ud558\ub098\ub85c \uc5f0\uacb0\ud55c \ub2e4\uc74c \ucd94\uac00\uc801\uc778 \uac00\uc911\uce58 matrix WO\uc640 \uacf1\ud558\uba74 \ub41c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_attention_heads_weight_matrix_o.png\" alt=\"\" style=\"width:735px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>\uc774\uac83\uc774 multi-head attention\uc758 \uc8fc\uc694 \ub0b4\uc6a9\uc774\ub77c \ud560 \uc218 \uc788\ub2e4. 
\uc5ec\ub7ec \uac1c\uc758 matrix\ub97c \ubcfc \uc218 \uc788\ub294\ub370 \ud558\ub098\uc758 \uadf8\ub9bc\uc73c\ub85c \uc2dc\uac01\ud654 \ud558\uba74 \uc544\ub798\uc640 \uac19\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_multi-headed_self-attention-recap.png\" alt=\"\"\/><\/figure>\n<\/div>\n\n\n<p>attention head\uc5d0 \ub300\ud574 \uc0b4\ud3b4 \ubd24\uc73c\ub2c8 \uc774\uc81c \uc55e\uc758 \uc608\uc81c\ub97c \uae30\ubc18\uc73c\ub85c \ubb38\uc7a5\uc5d0\uc11c &#8220;it&#8221; \ub2e8\uc5b4\ub97c encoding\ud560 \ub54c attention head\uac00 \uc5b4\ub514\ub97c \uc9d1\uc911\ud558\ub294\uc9c0 \uadf8\ub9bc\uc73c\ub85c \uc0b4\ud3b4\ubcf4\uc790.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_self-attention_visualization_2.png\" alt=\"\" style=\"width:491px;height:auto\"\/><figcaption class=\"wp-element-caption\">As we encode the word &#8220;it&#8221;, one attention head is focusing most on &#8220;the animal&#8221;, while another is focusing on &#8220;tired&#8221; &#8212; in a sense, the model&#8217;s representation of the word &#8220;it&#8221; bakes in some of the representation of both &#8220;animal&#8221; and &#8220;tired&#8221;.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\uc5ec\uae30\uc11c \ubaa8\ub4e0 attention head\ub97c \uadf8\ub9bc\uc73c\ub85c \ud45c\ud604\ud574\ubcf4\uba74 \ud574\uc11d\ud558\uae30 \uc5b4\ub824\uc6cc \uc9c8 \uc218 \uc788\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_self-attention_visualization_3.png\" alt=\"\" style=\"width:456px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">Representing The Order of The Sequence Using Positional 
Encoding<\/h2>\n\n\n\n<p>\uc55e\uc5d0\uc11c \uc124\uba85\ud55c \ubaa8\ub378\uc5d0\uc11c \uc785\ub825 \ubb38\uc7a5\uc5d0\uc758 \ub2e8\uc5b4 \uc21c\uc11c\ub97c \uc124\uba85\ud558\ub294 \ubd80\ubd84\uc774 \ub204\ub77d\ub410\ub2e4. \uc774\ub97c \uc704\ud574 Transformer\ub294 \uac01 input embedding\uc5d0 \uc5b4\ub5a4 vector\ub97c \ub354\ud574 \uc900\ub2e4. \uc774 vector\ub294 \ubaa8\ub378\uc5d0\uc11c \ud559\uc2b5\ud55c \ud2b9\uc815 \ud328\ud134\uc744 \ub530\ub974\uba70, \uc774\ub294 \uac01 \ub2e8\uc5b4\uc758 \uc704\uce58\uc640 \ubb38\uc7a5 \ub0b4\uc5d0\uc11c \ub2e8\uc5b4\ub4e4\uac04\uc758 \uac70\ub9ac\ub97c \ud30c\uc545\ud558\ub294\ub370 \ub3c4\uc6c0\uc744 \uc900\ub2e4. \uc774\ub807\uac8c \ud558\ub294 \uac1c\ub150\uc801 \ubc30\uacbd\uc740 \uc774 vector \ub97c embedding\uc5d0 \ub354\ud574 \uc8fc\uba74 Q\/K\/V vector\uc5d0 \uac01\uac01 \ud22c\uc601(projection)\ub418\uace0 dot-product attention \uacfc\uc815\uc5d0\uc11c embedding vector\ub4e4 \uac04\uc5d0 \uc758\ubbf8\uc801\uc778 \uac70\ub9ac(distance)\ub97c \uc81c\uacf5\ud574\uc8fc\uae30 \ub54c\ubb38\uc774\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_positional_encoding_vectors.png\" alt=\"\" style=\"width:733px;height:auto\"\/><figcaption class=\"wp-element-caption\">To give the model a sense of the order of the words, we add positional encoding vectors &#8212; the values of which follow a specific pattern.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>embedding\uc774 4\ucc28\uc6d0\uc73c\ub85c \uc774\ub8e8\uc5b4\uc84c\ub2e4\uace0 \uac00\uc815\ud558\uba74 \uc2e4\uc81c \uc77c\uc5b4\ub098\ub294 positional encoding\uc740 \uc544\ub798\uc640 \uac19\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_positional_encoding_example.png\" alt=\"\" 
style=\"width:724px;height:auto\"\/><figcaption class=\"wp-element-caption\">A real example of positional encoding with a toy embedding size of 4<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\uc774 \uc21c\uc11c\ub97c \ub098\ud0c0\ub0b4\ub294 vector\ub294 \uc5b4\ub5a4 \ud615\ud0dc\ub97c \ub760\uace0 \uc788\uc744\uae4c?<\/p>\n\n\n\n<p>\uc544\ub798 \uadf8\ub9bc\uc5d0\uc11c \uac01 \ud589\uc740 vector\uc758 positional encoding\uc5d0 \ud574\ub2f9\ud55c\ub2e4. \ub530\ub77c\uc11c, \uccab\ubc88\uc9f8 \ud589\uc740 \uc785\ub825\ub41c \ubb38\uc7a5\uc5d0\uc11c \uccab\ubc88\uc9f8 \ub2e8\uc5b4\uc5d0 \ud574\ub2f9\ud558\ub294 embedding\uacfc \ub367\uc148\ud55c positional encoding vector \uc774\ub2e4. \uac01 \ud589\uc740 512\uac1c\uc758 \uac12\uc744 \ud45c\ud604\ud558\uba70 \uac01 \uac12\uc740 1 ~ -1 \uc0ac\uc774\uc758 \uac12\uc744 \uac00\uc9c4\ub2e4. \uac01 \uac12\uc744 \uc0c9\uc73c\ub85c \ud45c\ud604\ud574\ubcf4\uba74 \uc544\ub798\uc640 \uac19\uc740 \ud328\ud134\uc744 \ud655\uc778\ud560 \uc218 \uc788\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_positional_encoding_large_example.png\" alt=\"\" style=\"width:717px;height:auto\"\/><figcaption class=\"wp-element-caption\">A real example of positional encoding for 20 words (rows) with an embedding size of 512 (columns). You can see that it appears split in half down the center. That&#8217;s because the values of the left half are generated by one function (which uses sine), and the right half is generated by another function (which uses cosine). They&#8217;re then concatenated to form each of the positional encoding vectors.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>positional encoding \uacf5\uc2dd\uc740 \ub17c\ubb38(section 3.5)\uc5d0 \uc124\uba85\ub418\uc5b4 \uc788\ub2e4. 
get_timing_signal_1d() \uc5d0\uc11c positional encoding\uc744 \uc0dd\uc131\ud558\ub294 \ucf54\ub4dc\ub97c \ud655\uc778\ud560 \uc218 \uc788\ub2e4. \uc774\ub294 \uc720\uc77c\ud55c positional encoding\ubc29\ubc95\uc774 \uc544\ub2c8\uc9c0\ub9cc \ubcf4\uc9c0 \ubabb\ud588\ub358 \uc785\ub825 sequence\uae4c\uc9c0 \ubcfc \uc218 \uc788\ub3c4\ub85d \ud655\uc7a5 \uac00\ub2a5\ud558\uac8c \ud55c\ub2e4\ub294 \uc7a5\uc810\uc744 \uc81c\uacf5\ud55c\ub2e4. (\uc989, training set\uc758 \ubb38\uc7a5 \uae38\uc774\ubcf4\ub2e4 \ub354 \uae34 \ubb38\uc7a5\uc744 \ubc88\uc5ed\ud560 \uacbd\uc6b0)<\/p>\n\n\n\n<p>(\uc800\uc790 \uc5c5\ub370\uc774\ud2b8, 2020.7) \uc704\uc5d0\uc11c \ubcf8 positional encoding\uc740 Transformer\uc758 Tensor2Tensor\ub97c \ud1b5\ud574 \uc5bb\uc740 \uac83\uc774\ub2e4. Google \ub17c\ubb38\uc5d0\uc11c \uc0ac\uc6a9\ud55c method\ub294 \ub450 \uac1c\uc758 signal\uc744 \uc9c1\uc811 \uc5f0\uacb0(concatenate)\uc2dc\ud0a8 \uac83\uc774 \uc544\ub2c8\ub77c \uc11c\ub85c \uc369\uc774\uac8c(interweave) \ud55c \ud615\ud0dc\ub77c\ub294 \uc810\uc5d0\uc11c \uc57d\uac04 \ub2e4\ub974\ub2e4. \ub2e4\uc74c \uadf8\ub9bc\uc740 \uc774\ub807\uac8c \ud560 \uacbd\uc6b0 \uc5b4\ub5bb\uac8c \uc2dc\uac01\ud654 \ub418\ub294\uc9c0\ub97c \ubcf4\uc5ec \uc900\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/attention-is-all-you-need-positional-encoding.png\" alt=\"\" style=\"width:797px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">The Residuals<\/h2>\n\n\n\n<p>\ub2e4\uc74c\uc73c\ub85c \ub118\uc5b4\uac00\uae30 \uc804\uc5d0, Encoder \uad6c\uc870\uc5d0\uc11c \uc0c1\uc138\ud558\uac8c \uc0b4\ud3b4\ubd10\uc57c \ud560 \uc810\uc740 \uac01 Encoder\uc758 sub-layer(self-attention, ffnn) \uc0ac\uc774\uc5d0 \ub0a8\uc544\uc788\ub294 \uc5f0\uacb0\uc120(\uc810\uc120\ud45c\ud604)\uc774 \uc788\ub2e4\ub294 \uac83\uc774\ub2e4. 
\uadf8 \uc774\ud6c4 \ub2e8\uacc4\ub85c layer-normalization\uac00 \ub530\ub77c\uc628\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_resideual_layer_norm.png\" alt=\"\" style=\"width:580px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>vector\uc640 self attention\uacfc \uad00\ub828\ub41c layer-normalize\uc5f0\uc0b0\uc744 \uc2dc\uac01\ud654 \ud55c\ub2e4\uba74 \uc544\ub798\uc640 \uac19\uc774 \ubcf4\uc77c \uac83\uc774\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_resideual_layer_norm_2.png\" alt=\"\" style=\"width:587px;height:auto\"\/><\/figure>\n<\/div>\n\n\n<p>\uc774\ub7f0 \ud615\ud0dc\ub294 Decoder\uc758 sub-layer\uc5d0\ub3c4 \ub3d9\uc77c\ud558\uac8c \uc801\uc6a9\ub41c\ub2e4. 2\uac1c\uc758 encoder\uc640 decoder\uac00 stack\uc73c\ub85c \uc313\uc778 Transformer\ub97c \uc0dd\uac01\ud574 \ubcf8\ub2e4\uba74 \ub2e4\uc74c\uacfc \uac19\uc740 \uad6c\uc870\uac00 \ub420 \uac83\uc774\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_resideual_layer_norm_3.png\" alt=\"\"\/><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">The Decoder Side<\/h2>\n\n\n\n<p>\uc774\uc81c Encoder \ubd80\ubd84\uc5d0 \ud574\ub2f9\ud558\ub294 \uac1c\ub150\uc744 \ub300\ubd80\ubd84 \uc0b4\ud3b4 \ubd24\uc73c\ubbc0\ub85c decoder\uc5d0\uc11c \ub0b4\ubd80\uc801\uc73c\ub85c \uc5b4\ub5bb\uac8c \ub3d9\uc791\ud558\ub294\uc9c0\ub294 \uae30\ubcf8\uc801\uc73c\ub85c \uc54c \uc218 \uc788\uc9c0\ub9cc \uadf8\ub798\ub3c4, \uc804\uccb4\uc801\uc73c\ub85c \uc5b4\ub5bb\uac8c \ub3d9\uc791\ud558\ub294\uc9c0 \ud55c\ubc88 \uc0b4\ud3b4\ubcf4\uc790.<\/p>\n\n\n\n<p>Encoder\ub294 \uc785\ub825 \ubb38\uc7a5\uc744 
\ud504\ub85c\uc138\uc2f1\ud558\uba74\uc11c \uc2dc\uc791\ud55c\ub2e4. \ucd5c\uc0c1\uc704\uc5d0 \uc788\ub294 Encoder\uc758 \ucd9c\ub825\uc740 attention vector \uc14b(K, V)\ub85c \ubcc0\ud615\ub41c\ub2e4. \uc774\ub4e4 vector\ub294 &#8220;encoder-decoder attention&#8221; layer\uc5d0 \uc788\ub294 \uac01 decoder\ub4e4\uc5d0 \uc758\ud574 \uc0ac\uc6a9\ub418\uba70 &#8220;encoder-decoder attention&#8221; layer\ub294 decoder\uac00 input sequence(\uc785\ub825\ubb38\uc7a5)\uc5d0\uc11c \ud574\ub2f9 \ub2e8\uc5b4\uc640 correlation\ub418\ub294 \uacf3\uc5d0 focusing\ud558\ub3c4\ub85d \ub3c4\uc6c0\uc744 \uc900\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_decoding_1.gif\" alt=\"\" style=\"width:759px;height:auto\"\/><figcaption class=\"wp-element-caption\">After finishing the encoding phase, we begin the decoding phase. Each step in the decoding phase outputs an element from the output sequence (the English translation sentence in this case).<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\uc774\ud6c4\uc758 \ub2e8\uacc4\ub294 Transformer decoder\uac00 \ucd9c\ub825\uc744 \uc885\ub8cc\ud55c\ub2e4\ub294 \uac83\uc744 \uac00\ub9ac\ud0a4\ub294 \ud2b9\ubcc4\ud55c \uae30\ud638\ub97c \ub9cc\ub0a0 \ub54c\uae4c\uc9c0 \uc774 \uacfc\uc815\uc744 \ubc18\ubcf5\ud55c\ub2e4. \uac01 \ub2e8\uacc4\uc758 \ucd9c\ub825(output)\uc740 \ub2e4\uc74c decoder \ub2e8\uacc4\uc758 \uc785\ub825\uc73c\ub85c \ub4e4\uc5b4\uac04\ub2e4. \uadf8\ub9ac\uace0 decoder\ub4e4\uc740 encoder\uc5d0\uc11c \ud588\ub358 \uac83\ucc98\ub7fc \uc790\uc2e0\uc758 \uacb0\uacfc\ub97c \uc0c1\uc704\ub85c bubble up \uc2dc\ud0a8\ub2e4. 
\uadf8\ub9ac\uace0, Encoder\uc758 input\uc744 \ub300\uc0c1\uc73c\ub85c \uc218\ud589\ud588\ub358 \uac83\ucc98\ub7fc decoder input\uc5d0 embedding\uc744 \uba3c\uc800 \uc218\ud589\ud558\uace0 \uac01 \ub2e8\uc5b4\uc758 \uc704\uce58\ub97c \uac00\ub9ac\ud0a4\uae30 \uc704\ud574 positional encoding\uc744 \ub354\ud558\ub294 \uacfc\uc815\uc744 \uc218\ud589\ud55c\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_decoding_2.gif\" alt=\"\"\/><\/figure>\n<\/div>\n\n\n<p>Decoder\uc5d0\uc11c self attention layer\ub294 encoder\uc5d0\uc11c\uc640\ub294 \uc57d\uac04 \ub2e4\ub974\uac8c \ub3d9\uc791\ud55c\ub2e4.<\/p>\n\n\n\n<p>Decoder\uc5d0\uc11c self-attention layer\ub294 output sequence\uc5d0\uc11c \ubb38\uc7a5\uc5d0\uc11c \uc55e\ucabd \uc601\uc5ed\uc73c\ub85c\ub9cc \ucc98\ub9ac\ud558\ub3c4\ub85d \ud5c8\uc6a9\ub3fc \uc788\ub2e4. \uc774\ub294 self-attention\uacfc\uc815\uc758 softmax \uc774\uc804 \ub2e8\uacc4\uc5d0\uc11c future position( -inf \ub85c \uc124\uc815)\uc744 masking\ud568\uc73c\ub85c\uc368 \uac00\ub2a5\ud558\ub2e4.<\/p>\n\n\n\n<p>&#8220;Encoder-Decoder Attention&#8221; layer\ub294 \uc544\ub798 \ub2e8 layer\uc5d0\uc11c Query matrix\ub97c \ub9cc\ub4dc\ub294 \uacfc\uc815\uc744 \uc81c\uc678\ud558\uba74 multiheaded self-attention\uacfc \ube44\uc2b7\ud558\uac8c \ub3d9\uc791\ud558\uace0 Key, Value matrix\ub294 Encoder stack\uc758 output\uc5d0\uc11c \uac00\uc838\uc628\ub2e4.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">The Final Linear and Softmax Layer<\/h2>\n\n\n\n<p>Decoder stack\uc740 float \ud615\uc758 Vector\ub97c \ucd9c\ub825\ud55c\ub2e4. \uc774 \uac12\uc744 \uc5b4\ub5bb\uac8c \ub2e8\uc5b4\ub85c \ubcc0\ud658\ud560\uae4c? \uc774\uac83\uc774 \ub9c8\uc9c0\ub9c9 \ub2e8\uc5d0 \uc788\ub294 Linear layer\uc758 \uc5ed\ud560\uc774\ub2e4. 
\uc774 \ub2e4\uc74c\uc5d0\ub294 Softmax Layer\ub85c \uc774\uc5b4\uc9c4\ub2e4.<\/p>\n\n\n\n<p>Linear Layer\ub294 fully connected neural network\uc774\uba70 decoder stack\uc5d0 \uc758\ud574 \ub9cc\ub4e4\uc5b4\uc9c4 vector\ub370\uc774\ud130\ub97c logit vector\ub85c \ubd88\ub9ac\ub294 \uc544\uc8fc, \uc544\uc8fc \ud070 vector\ub85c projection\ud55c\ub2e4.<\/p>\n\n\n\n<p>\uc704\uc5d0\uc11c \uc0ac\uc6a9\ud55c \ubaa8\ub378\uc774 training dataset\uc73c\ub85c\ubd80\ud130 \ud559\uc2b5\ud558\uc5ec 10,000\uac1c\uc758 \uc601\uc5b4\ub2e8\uc5b4(\ubaa8\ub378\uc758 \ucd9c\ub825 \uc5b4\ud718)\ub97c \uc54c\uace0 \uc788\ub2e4\uace0 \uac00\uc815\ud558\uba74 10,000\uac1c\uc758 \uc140\uc744 \uac00\uc9c4 logit vector\uac00 \ub9cc\ub4e4\uc5b4 \uc9c4\ub2e4. \uc774\ub54c \uac01 \uc140\uc740 unique\ud55c \ub2e8\uc5b4\uc640 \ub300\uc751\ub418\ub294 \uc810\uc218(score)\ub97c \uac00\uc9c0\uac8c \ub41c\ub2e4. \uc774\uac83\uc774 Linear layer\uc758 \uac70\uce5c \uc774\ud6c4\uc758 \ubaa8\ub378 \uacb0\uacfc(output)\ub97c \ud574\uc11d\ud558\ub294 \ubc29\ubc95\uc774\ub2e4.<\/p>\n\n\n\n<p>Softmax layer\ub294 \uc774 \uc810\uc218(score)\ub97c \ud655\ub960(probability)\ub85c \uce58\ud658\ud55c\ub2e4.(\uc591\uc218 \uac12\uc774\uba70 0 ~ 1.0 \uc0ac\uc774\uc758 \uac12). \uac00\uc7a5 \ub192\uc740 \ud655\ub960\uc744 \uac00\uc9c4 \uc140\uc774 \uc120\ud0dd\ub418\uba70 \uc774\uc640 \uc5f0\uad00\ub41c \ub2e8\uc5b4\uac00 \ud604 \ub2e8\uacc4\uc5d0\uc11c\uc758 \uacb0\uacfc\ub85c \ub9cc\ub4e4\uc5b4 \uc9c4\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_decoder_output_softmax.png\" alt=\"\" style=\"width:756px;height:auto\"\/><figcaption class=\"wp-element-caption\">This figure starts from the bottom with the vector produced as the output of the decoder stack. 
It is then turned into an output word.<\/figcaption><\/figure>\n<\/div>\n\n\n<h2 class=\"wp-block-heading\">Recap Of Training<\/h2>\n\n\n\n<p>\uc9c0\uae08\uae4c\uc9c0 \ud6c8\ub828\ub41c Transformer\ub97c \ud1b5\ud574 \uc804\ubc18\uc801\uc778 forward-pass \uacfc\uc815\uc744 \uc54c\uc544\ubd24\uc73c\uba70, \ubaa8\ub378\uc744 training \ud558\ub294 \uac1c\ub150\uc744 \uc0b4\ud3b4\ubcf4\uba74 \ub3c4\uc6c0\uc774 \ub420 \uac83\uc774\ub2e4.<\/p>\n\n\n\n<p>training\ud558\ub294 \ub3d9\uc548 \uc544\uc9c1 \ud559\uc2b5\uc548\ub41c \ubaa8\ub378\uc740 \uc815\ud655\ud788 \ub3d9\uc77c\ud55c forward pass\ub97c \uac70\uce58\uac8c \ub41c\ub2e4. \uadf8\ub7ec\ub098, label\ub41c training set\uc73c\ub85c training\ud558\uae30 \ub54c\ubb38\uc5d0 \uacb0\uacfc \uac12(output)\uc744 \uc2e4\uc81c \uc815\ub2f5\uacfc \ube44\uad50\ud560 \uc218 \uc788\ub2e4.<\/p>\n\n\n\n<p>\uc774\ub97c \uc2dc\uac01\ud654 \ud558\uae30 \uc704\ud574 \uacb0\uacfc \uc5b4\ud718\ub85c 6\uac1c\uc758 \ub2e8\uc5b4\ub9cc \uc0ac\uc6a9\ud55c\ub2e4\uace0 \uac00\uc815\ud558\uc790(&#8220;a&#8221;, &#8220;am&#8221;, &#8220;i&#8221;, &#8220;thanks&#8221;, &#8220;student&#8221;, &#8220;&lt;eos&gt;&#8221; (end of sentence))<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/vocabulary.png\" alt=\"\" style=\"width:696px;height:auto\"\/><figcaption class=\"wp-element-caption\">The output vocabulary of our model is created in the preprocessing phase before we even begin training.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\ucd9c\ub825 \uc5b4\ud718\ub97c \uc815\uc758\ud558\uba74 \uac01 \ub2e8\uc5b4\ub97c \uc9c0\uc815\ud558\uae30 \uc704\ud574 \ub3d9\uc77c\ud55c \ud06c\uae30\uc758 vector\ub97c \uc0ac\uc6a9\ud560 \uc218 \uc788\ub2e4. \uc774\uac83\uc744 one-hot encoding\uc774\ub77c\uace0 \ud55c\ub2e4. 
\uadf8\ub798\uc11c \uc608\ub97c \ub4e4\uba74, &#8220;am&#8221;\ub2e8\uc5b4\ub97c \uc544\ub798\uc758 vector\uc640 \uac19\uc774 \uc9c0\uc815\ud560 \uc218 \uc788\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/one-hot-vocabulary-example.png\" alt=\"\" style=\"width:631px;height:auto\"\/><figcaption class=\"wp-element-caption\">Example: one-hot encoding of our output vocabulary<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\ub2e4\uc74c\uc73c\ub85c, \ubaa8\ub378\uc758 loss function\uc5d0 \ub300\ud574 \uc54c\uc544\ubcf4\uc790 &#8211; training\ub2e8\uacc4\uc5d0\uc11c optimize\ud558\uae30 \uc704\ud574 \uc0ac\uc6a9\ud558\ub294 \uce21\uc815\uc218\ub2e8\uc73c\ub85c \ub9e4\uc6b0 \uc815\ud655\ud55c \ubaa8\ub378\uc774 \ub9cc\ub4e4\uc5b4\uc9c0\ub3c4\ub85d \ud558\ub294 \uc5ed\ud560\uc744 \uc218\ud589\ud55c\ub2e4.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">The Loss Function<\/h2>\n\n\n\n<p>\ubaa8\ub378\uc744 training\ud55c\ub2e4\uace0 \ud574\ubcf4\uc790. training\uacfc\uc815\uc5d0\uc11c \uccab\ubc88\uc9f8 \ub2e8\uacc4\ub85c\uc368 \uc544\uc8fc \uac04\ub2e8\ud55c \uc608\uc81c(&#8220;merci&#8221;\ub97c &#8220;thanks&#8221;\ub85c \ubc88\uc5ed)\uc5d0 \ub300\ud574 training\ud55c\ub2e4\uace0 \uac00\uc815\ud574\ubcf4\uc790.<\/p>\n\n\n\n<p>\uc774\uac83\uc774 \uc758\ubbf8\ud558\ub294 &#8220;thanks&#8221; \ub2e8\uc5b4\ub97c \uac00\ub9ac\ud0a4\ub294 \ud655\ub960\uac12\uc758 \ubd84\ud3ec\ub97c \uacb0\uacfc\ub85c \uc6d0\ud55c\ub2e4\ub294 \uac83\uc774\ub2e4. 
\uadf8\ub7ec\ub098 \uc774 \ubaa8\ub378\uc740 \uc544\uc9c1 train\ub418\uc9c0 \uc54a\uc558\uc73c\ubbc0\ub85c \uc544\uc9c1 \uc774\ub7f0 \uacb0\uacfc\uac00 \ubc1c\uc0dd\ub418\uc9c0 \uc54a\uc744 \uac83\uc774\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/transformer_logits_output_and_label.png\" alt=\"\" style=\"width:645px;height:auto\"\/><figcaption class=\"wp-element-caption\">Since the model&#8217;s parameters (weights) are all initialized randomly, the (untrained) model produces a probability distribution with arbitrary values for each cell\/word. We can compare it with the actual output, then tweak all the model&#8217;s weights using backpropagation to make the output closer to the desired output.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\uadf8\ub807\ub2e4\uba74 \uc774 \ub450\uac1c\uc758 \ud655\ub960\uc744 \uc5b4\ub5bb\uac8c \ube44\uad50\ud560\uae4c? \uc5ec\uae30\uc11c\ub294 \ub2e8\uc21c\ud788 \uac01 \ud655\ub960\uac12\uc744 \ube84\uc148\ud588\ub2e4. \ub354 \uc790\uc138\ud55c \ub0b4\uc6a9\uc740 <a href=\"https:\/\/colah.github.io\/posts\/2015-09-Visual-Information\/\">cross-entropy<\/a>\uc640 <a href=\"https:\/\/www.countbayesie.com\/blog\/2017\/5\/9\/kullback-leibler-divergence-explained\">Kullback\u2013Leibler divergence<\/a>\uc744 \ucc38\uace0\ud558\uae30 \ubc14\ub780\ub2e4.<\/p>\n\n\n\n<p>\uadf8\ub7ec\ub098 \uc774\uac83\uc740 \ub9e4\uc6b0 \ub2e8\uc21c\ud654\uc2dc\ud0a8 \uc608\uc81c\ub77c\ub294 \uac83\uc5d0 \uc8fc\ubaa9\ud558\uc790. \uc880 \ub354 \ud604\uc2e4\uc801\uc73c\ub85c &#8220;je suis \u00e9tudiant&#8221;\uc640 \uac19\uc774 \ud55c \ub2e8\uc5b4\ubcf4\ub2e4 \ub354 \uae38 \ubb38\uc7a5\uc744 \uc785\ub825\uc73c\ub85c \uc0ac\uc6a9\ud560 \uac83\uc774\uba70 \ucd9c\ub825\uc740 &#8220;i am a student&#8221;\uac00 \ub420 \uac83 \uc774\ub2e4. 
\uc774\uac83\uc774 \uc758\ubbf8\ud558\ub294 \uac83\uc740 \ubaa8\ub378\uc774 \uc5f0\uc18d\uc801\uc73c\ub85c \ud655\ub960\uc758 \ubd84\ud3ec\ub97c \ucd9c\ub825\ud55c\ub2e4\ub294 \uac83\uc744 \uae30\ub300\ud55c\ub2e4\ub294 \uac83\uc774\ub2e4. \uc5ec\uae30\uc11c<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\uac01 \ud655\ub960\uc758 \ubd84\ud3ec\ub294 vocab_size\ub85c \ud45c\ud604\ub418\ub294 vector\uc758 \ud06c\uae30(width)\ub85c \ud45c\ud604\ub41c\ub2e4.(toy example\uc5d0\uc11c 6\uc744 \uc0ac\uc6a9\ud558\uc9c0\ub9cc \ud604\uc2e4\uc801\uc73c\ub85c\ub294 30,000 \ub610\ub294 50,000\uc744 \uc0ac\uc6a9\ud55c\ub2e4)<\/li>\n\n\n\n<li>\ucc98\uc74c \ud655\ub960\uc758 \ubd84\ud3ec\ub294 &#8220;i&#8221;\ub2e8\uc5b4\uac00 \uc788\ub294 \uc140\uc5d0\uc11c\uc758 \ud655\ub960\uc774 \uac00\uc7a5 \ub192\ub2e4.<\/li>\n\n\n\n<li>\ub450\ubc88\uc9f8 \ud655\ub960\uc758 \ubd84\ud3ec\ub294 &#8220;am&#8221;\ub2e8\uc5b4\uac00 \uc788\ub294 \uc140\uc5d0\uc11c\uc758 \ud655\ub960\uc774 \uac00\uc7a5 \ub192\ub2e4.<\/li>\n\n\n\n<li>\uc774\ub807\uac8c \uacc4\uc18d\ud574\uc11c \ub2e4\uc12f \ubc88\uc9f8\ub85c \ucd9c\ub825\ub418\ub294 \ud655\ub960\uc758 \ubd84\ud3ec\uac00 &#8216;&lt;end of sentence&gt;&#8217;\uae30\ud638\ub97c \uac00\ub9ac\ud0ac \ub54c\uae4c\uc9c0 \uc9c4\ud589\ud558\uba70 \uc774 \ub610\ud55c 10,000\uac1c\uc758 \uc5b4\ud718 \uc911\uc5d0\uc11c \ud558\ub098\uc640 \ub9e4\ud551\ub41c\ub2e4.<\/li>\n<\/ul>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/output_target_probability_distributions.png\" alt=\"\" style=\"width:681px;height:auto\"\/><figcaption class=\"wp-element-caption\">The targeted probability distributions we&#8217;ll train our model against in the training example for one sample sentence.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\ucda9\ubd84\ud788 \ud070 dataset\uc73c\ub85c training\ud55c \ud6c4\uc5d0 \ub3c4\ucd9c\ub41c \ud655\ub960\uc758 \ubd84\ud3ec\ub294 
\uc544\ub798\uc640 \uac19\uc744 \uac83\uc774\ub77c\uace0 \uae30\ub300\ud560 \uc218 \uc788\ub2e4.<\/p>\n\n\n<div class=\"wp-block-image\">\n<figure class=\"aligncenter size-large is-resized\"><img decoding=\"async\" src=\"http:\/\/jalammar.github.io\/images\/t\/output_trained_model_probability_distributions.png\" alt=\"\" style=\"width:675px;height:auto\"\/><figcaption class=\"wp-element-caption\">Hopefully upon training, the model would output the right translation we expect. Of course it&#8217;s no real indication if this phrase was part of the training dataset (see:&nbsp;<a href=\"https:\/\/www.youtube.com\/watch?v=TIgfjmp-4BA\">cross validation<\/a>). Notice that every position gets a little bit of probability even if it&#8217;s unlikely to be the output of that time step &#8212; that&#8217;s a very useful property of softmax which helps the training process.<\/figcaption><\/figure>\n<\/div>\n\n\n<p>\uc774\uc81c \ubaa8\ub378\uc774 \ud55c\ubc88\uc5d0 \ud558\ub098\uc529 \uacb0\uacfc\ub97c \ub3c4\ucd9c\ud574 \ub0b4\uae30 \ub54c\ubb38\uc5d0 \ubaa8\ub378\uc774 \ud655\ub960\uc758 \ubd84\ud3ec \uc911\uc5d0\uc11c \uac00\uc7a5 \ub192\uc740 \ud655\ub960\uc744 \uac00\uc9c0\ub294 \ub2e8\uc5b4\ub97c \uc120\ud0dd\ud558\uace0 \ub098\uba38\uc9c0\ub294 \ubc84\ub9ac\ub294 \uac83\uc774\ub77c\uace0 \uac00\uc815\ud560 \uc218 \uc788\ub2e4. 
\uc774\uac83\uc774 \ub3d9\uc791 \ubc29\ubc95 \uc911\uc758 \ud558\ub098(greedy decoding)\uc774\uba70, \ub2e4\ub978 \ubc29\ubc95\uc740 \ucd5c\uc0c1\uc704 2\uac1c\uc758 \ub2e8\uc5b4(\uc608: &#8220;I&#8221;, \uc640 &#8220;a&#8221;)\ub97c \ucde8\ud558\uace0 \ub2e4\uc74c \ub2e8\uacc4\uc5d0\uc11c \ubaa8\ub378\uc744 \ub450\ubc88 \uc2e4\ud589\ud558\ub294 \uac83\uc774\ub2e4: \uccab\ubc88\uc9f8 \ucd9c\ub825\uc774 \ub2e8\uc5b4 &#8220;I&#8221;\ub77c\uace0 \uac00\uc815\ud558\uace0 \ub2e4\uc74c \ubc88\uc758 \uccab\ubc88\uc9f8 \ucd9c\ub825\uc774 \ub2e8\uc5b4 &#8220;a&#8221;\ub77c\uace0 \uac00\uc815\ud558\uace0 #1\ubc88\uc9f8\uc640 #2\ubc88\uc9f8\ub97c \uac10\uc548\ud574\uc11c \uc624\ub958\uac00 \uc801\uac8c \ubc1c\uc0dd\ud55c \ubc84\uc804\uc774 \uc720\uc9c0\ub41c\ub2e4\uace0 \uac00\uc815\ud560 \uacbd\uc6b0. \uc774 \ubc29\ubc95\uc740 &#8220;beam search&#8221;\ub77c\uace0 \ud558\uba70 \uc608\uc81c\uc5d0\uc11c beam_size\ub294 2\uc774\ub2e4(\uc5b4\ub5a4 \uacbd\uc6b0\uc5d0\ub77c\ub3c4 \ub450 \ubd80\ubd84 \uac00\uc124(unfinished translation)\uc740 \uba54\ubaa8\ub9ac\uc5d0 \uc720\uc9c0\ub41c\ub2e4\ub294 \uac83\uc744 \uc758\ubbf8\ud55c\ub2e4), \uadf8\ub9ac\uace0 top_beam \ub610\ud55c 2\uac00 \ub41c\ub2e4(2\uac1c\uc758 translation\uc744 \ubc18\ud658\ud55c\ub2e4\ub294 \uac83\uc744 \uc758\ubbf8). \uc774\ub4e4 2\uac1c \ubaa8\ub450 \uc2e4\ud5d8\ud574 \ubcfc \uc218 \uc788\ub294 hyperparameter\uc774\ub2e4.<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\ucc38\uace0: The illustrated Transformer GPT\ub294 Generative Pre-trained Transformer\uc758 \uc57d\uc790\ub77c\uace0 \uc54c\uace0 \uc788\ub2e4. \uc5ec\uae30\uc11c \uac00\uc7a5 \uc911\uc694\ud55c \uc5ed\ud560\uc744 \ud558\ub294 \uac83\uc774 Transformer\uc77c \uac83\uc774\ub2e4. \uadf8\ub807\ub2e4\uba74 Transformer\uac00 \uc5b4\ub5a4 \uae30\ub2a5\uc744 \ud558\uae30\uc5d0 \uac00\uc7a5 \uc911\uc694\ud55c\uc9c0 \uad81\uae08\uc99d\uc744 \uac00\uc9c0\uc9c0 \uc54a\uc744 \uc218 \uc5c6\ub2e4. 
\ub17c\ubb38\uc758 \ud3ec\ud568\ud574\uc11c \uc5ec\ub7ec \uc790\ub8cc\ub97c \uc0b4\ud3b4\ubd10\ub3c4 \uae00\uc790\ub9cc \ubcf4\uc774\uc9c0 \ubb38\ub9e5\uc774 \ubcf4\uc774\uc9c0 \uc54a\uc558\ub294\ub370 \uc774 \ubb38\uc11c\ub97c \ubcf4\uace0\uc11c \ubb34\ub985\uc744 \ud0c1 \uce58\uac8c \ub418\uc5c8\ub2e4. \uc6b0\uc120 \uc26c\uc6b4 \uc124\uba85\uc744 \ud1b5\ud574 \uad81\uae08\uc99d\uc744 \ud574\uacb0\ud558\ub3c4\ub85d \ud574 \uc900 Jay Alammar\uc5d0\uac8c \uac10\uc0ac\ub97c \ub4dc\ub9ac\uace0 \ub098\uc758 \uc5b8\uc5b4\ub85c \ub2e4\uc2dc \uc815\ub9ac\ud574 \ubcf4\uace0\uc790 \ud55c\ub2e4. \ub098\uc911\uc5d0 \uc790\uc138\ud558\uac8c \uc124\uba85\ud558\uaca0\uc9c0\ub9cc Transformer\ub294 \ub0b4\ubd80\uc801\uc73c\ub85c Attention\uc774\ub77c\ub294 \uac1c\ub150\uc744 \uc0ac\uc6a9\ud55c\ub2e4. \uc774\ub97c \uc0ac\uc6a9\ud568\uc73c\ub85c\uc368 \uadf8\ub3d9\uc548 \ubb38\uc81c\ub85c \uc9c0\uc801\ub410\ub358 \ub290\ub9b0&#8230;<\/p>\n<p class=\"read-more\"><a class=\"btn btn-default\" href=\"https:\/\/skanto.co.kr\/?p=703\"> Read More<span class=\"screen-reader-text\">  Read 
More<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_import_markdown_pro_load_document_selector":0,"_import_markdown_pro_submit_text_textarea":"","footnotes":""},"categories":[14,7],"tags":[48,87,49,92],"class_list":["post-703","post","type-post","status-publish","format-standard","hentry","category-sw-development","category-7","tag-ai","tag-chatgpt","tag-gpt","tag-transformer"],"_links":{"self":[{"href":"https:\/\/skanto.co.kr\/index.php?rest_route=\/wp\/v2\/posts\/703","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/skanto.co.kr\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/skanto.co.kr\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/skanto.co.kr\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/skanto.co.kr\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=703"}],"version-history":[{"count":117,"href":"https:\/\/skanto.co.kr\/index.php?rest_route=\/wp\/v2\/posts\/703\/revisions"}],"predecessor-version":[{"id":1040,"href":"https:\/\/skanto.co.kr\/index.php?rest_route=\/wp\/v2\/posts\/703\/revisions\/1040"}],"wp:attachment":[{"href":"https:\/\/skanto.co.kr\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=703"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/skanto.co.kr\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=703"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/skanto.co.kr\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=703"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}