{"id":23027,"date":"2026-04-25T01:23:48","date_gmt":"2026-04-25T01:23:48","guid":{"rendered":"https:\/\/umang.pk\/2026\/04\/25\/%d8%a7%d9%be%d9%86%db%8c-%d8%b2%d8%a8%d8%a7%d9%86-%da%a9%db%92-%d9%84%db%8c%db%92-%d9%85%d8%ae%d8%b5%d9%88%d8%b5-%d8%a7%db%8c%d9%84-%d8%a7%db%8c%d9%84-%d8%a7%db%8c%d9%85-%da%a9%db%8c%d8%b3%db%92\/"},"modified":"2026-04-25T01:23:48","modified_gmt":"2026-04-25T01:23:48","slug":"%d8%a7%d9%be%d9%86%db%8c-%d8%b2%d8%a8%d8%a7%d9%86-%da%a9%db%92-%d9%84%db%8c%db%92-%d9%85%d8%ae%d8%b5%d9%88%d8%b5-%d8%a7%db%8c%d9%84-%d8%a7%db%8c%d9%84-%d8%a7%db%8c%d9%85-%da%a9%db%8c%d8%b3%db%92","status":"publish","type":"post","link":"https:\/\/umang.pk\/en_us\/2026\/04\/25\/%d8%a7%d9%be%d9%86%db%8c-%d8%b2%d8%a8%d8%a7%d9%86-%da%a9%db%92-%d9%84%db%8c%db%92-%d9%85%d8%ae%d8%b5%d9%88%d8%b5-%d8%a7%db%8c%d9%84-%d8%a7%db%8c%d9%84-%d8%a7%db%8c%d9%85-%da%a9%db%8c%d8%b3%db%92\/","title":{"rendered":"\u0627\u067e\u0646\u06cc \u0632\u0628\u0627\u0646 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u062e\u0635\u0648\u0635 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u06a9\u06cc\u0633\u06d2 \u0628\u0646\u0627\u0626\u06cc\u06ba [Full Handbook]"},"content":{"rendered":"\n<div id=\"\">\n<p>\u06a9\u06cc\u0627 \u06c1\u0648\u06af\u0627 \u0627\u06af\u0631 \u0622\u067e \u0627\u067e\u0646\u06cc \u0645\u0627\u062f\u0631\u06cc \u0632\u0628\u0627\u0646 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0634\u0631\u0648\u0639 \u0633\u06d2 \u0627\u067e\u0646\u0627 LLM \u0628\u0646\u0627 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u061f \u0628\u0627\u0644\u06a9\u0644 \u06cc\u06c1\u06cc \u06c1\u06d2 \u062c\u0648 \u06c1\u0645 \u0627\u0633 \u0679\u06cc\u0648\u0679\u0648\u0631\u06cc\u0644 \u0645\u06cc\u06ba \u06a9\u0631\u06cc\u06ba \u06af\u06d2\u06d4 \u06cc\u06c1 \u0633\u0645\u062c\u06be\u0646\u06d2 \u06a9\u0627 \u0628\u06c1\u062a\u0631\u06cc\u0646 \u0637\u0631\u06cc\u0642\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u0627\u06cc\u0644 
\u0627\u06cc\u0644 \u0627\u06cc\u0645 \u06a9\u06cc\u0633\u06d2 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u062f\u0631\u0627\u0635\u0644 \u0627\u0633\u06d2 \u0628\u0646\u0627\u0646\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u0622\u0626\u06cc\u06d2 \u0627\u06cc\u06a9 \u0645\u062e\u0635\u0648\u0635 \u0632\u0628\u0627\u0646 \u0645\u06cc\u06ba \u0627\u067e\u0646\u0627 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u0628\u0646\u0627\u0646\u06d2 \u06a9\u06d2 \u06c1\u0631 \u0645\u0631\u062d\u0644\u06d2 \u0633\u06d2 \u06af\u0632\u0631\u062a\u06d2 \u06c1\u06cc\u06ba (\u0627\u0633 \u0645\u0639\u0627\u0645\u0644\u06d2 \u0645\u06cc\u06ba \u0627\u0631\u062f\u0648)\u06d4 \u0627\u0633 \u0633\u06d2 \u0622\u067e \u06a9\u0648 \u06cc\u06c1 \u0633\u0645\u062c\u06be\u0646\u06d2 \u0645\u06cc\u06ba \u0645\u062f\u062f \u0645\u0644\u06d2 \u06af\u06cc \u06a9\u06c1 LLM \u06a9\u06d2 \u0627\u0646\u062f\u0631 \u06a9\u06cc\u0627 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u062c\u062f\u06cc\u062f LLM \u0627\u0633 \u062a\u062d\u0642\u06cc\u0642\u06cc \u0645\u0642\u0627\u0644\u06d2 \u0633\u06d2 \u062a\u0639\u0644\u0642 \u0631\u06a9\u06be\u062a\u0627 \u06c1\u06d2 \u062c\u0633 \u0646\u06d2 \u0633\u0628 \u06a9\u0686\u06be \u0628\u062f\u0644 \u062f\u06cc\u0627: <strong>&#8220;Attention Is All You Need&#8221;<\/strong>\u06d4 
\u0644\u06cc\u06a9\u0646 \u0631\u06cc\u0627\u0636\u06cc \u0645\u06cc\u06ba \u06a9\u06be\u0648 \u062c\u0627\u0646\u06d2 \u06a9\u06d2 \u0628\u062c\u0627\u0626\u06d2 (\u0645\u06cc\u06ba \u0631\u06cc\u0627\u0636\u06cc \u0645\u06cc\u06ba \u0627\u0686\u06be\u0627 \u0646\u06c1\u06cc\u06ba \u06c1\u0648\u06ba)\u060c \u0645\u06cc\u06ba \u0627\u0633\u06d2 \u0634\u0631\u0648\u0639 \u0633\u06d2 \u0628\u0646\u0627 \u06a9\u0631 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u062c\u0627 \u0631\u06c1\u0627 \u06c1\u0648\u06ba\u06d4<\/p>\n<h3 id=\"heading-who-is-this-handbook-for\">\u06cc\u06c1 \u06a9\u062a\u0627\u0628\u0686\u06c1 \u06a9\u0633 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06c1\u06d2\u061f<\/h3>\n<p>\u0633\u0627\u0641\u0679 \u0648\u06cc\u0626\u0631 \u0627\u0646\u062c\u06cc\u0646\u0626\u0631\u0632\u060c \u067e\u0631\u0648\u0688\u06a9\u0679 \u06a9\u06d2 \u0645\u0627\u0644\u06a9\u0627\u0646 \u06cc\u0627 \u06a9\u0648\u0626\u06cc \u0628\u06be\u06cc \u0627\u0633 \u0628\u0627\u0631\u06d2 \u0645\u06cc\u06ba \u062f\u0644\u0686\u0633\u067e\u06cc \u0631\u06a9\u06be\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 LLM \u06a9\u06cc\u0633\u06d2 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0627\u06af\u0631 \u0622\u067e \u06a9\u06d2 \u067e\u0627\u0633 \u0645\u0634\u06cc\u0646 \u0644\u0631\u0646\u0646\u06af \u06a9\u0627 \u06a9\u0686\u06be \u0639\u0644\u0645 \u06c1\u06d2 \u062a\u0648 \u06cc\u06c1 \u0627\u0686\u06be\u0627 \u06c1\u06d2\u060c \u0644\u06cc\u06a9\u0646 \u0627\u06af\u0631 \u0622\u067e \u06a9\u06d2 \u067e\u0627\u0633 \u0646\u06c1\u06cc\u06ba \u06c1\u06d2 \u062a\u0648 \u0641\u06a9\u0631 \u0646\u06c1 \u06a9\u0631\u06cc\u06ba\u06d4 \u0645\u06cc\u06ba \u0646\u06d2 \u06cc\u06c1 \u0627\u0633 \u0644\u06cc\u06d2 \u0644\u06a9\u06be\u0627 \u06c1\u06d2 \u06a9\u06c1 \u0622\u067e \u06a9\u0648 \u0627\u0633 \u0679\u06cc\u0648\u0679\u0648\u0631\u06cc\u0644 \u0633\u06d2 \u0628\u0627\u06c1\u0631 \u062c\u0627\u0646\u06d2 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u0646\u06c1\u06cc\u06ba 
\u06c1\u06d2\u06d4<\/p>\n<p>\u0622\u062e\u0631 \u0645\u06cc\u06ba\u060c \u0622\u067e \u06a9\u06d2 \u067e\u0627\u0633 \u0627\u06cc\u06a9 <strong>\u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u0648\u0627 \u0627\u0631\u062f\u0648 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u0686\u06cc\u0679 \u0628\u0648\u0679<\/strong> \u062a\u0639\u06cc\u0646\u0627\u062a \u0627\u0648\u0631 \u0686\u0644\u062a\u0627 \u06c1\u0648\u0627 \u06c1\u0648\u06af\u0627\u06d4 \u0630\u06cc\u0644 \u0645\u06cc\u06ba \u0628\u06cc\u0627\u0646 \u06a9\u0631\u062f\u06c1 \u0645\u0631\u0627\u062d\u0644 \u067e\u0631 \u0639\u0645\u0644 \u06a9\u0631\u06a9\u06d2 \u0622\u067e \u0627\u0633\u06d2 \u0627\u067e\u0646\u06cc \u0632\u0628\u0627\u0646 \u0645\u06cc\u06ba \u0628\u06be\u06cc \u0628\u0646\u0627 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<h3 id=\"heading-a-note-on-expectations\">\u062a\u0648\u0642\u0639\u0627\u062a \u067e\u0631 \u0646\u0648\u0679\u0633:<\/h3>\n<p>\u06cc\u06c1\u0627\u06ba \u06a9\u0627 \u0645\u0642\u0635\u062f \u0627\u067e\u0646\u06d2 \u0622\u067e \u06a9\u0648 \u062a\u0639\u0644\u06cc\u0645 \u062f\u06cc\u0646\u0627 \u06c1\u06d2 \u06a9\u06c1 LLM \u062a\u0645\u0627\u0645 \u0645\u0631\u0627\u062d\u0644 \u0633\u06d2 \u06af\u0632\u0631 \u06a9\u0631 \u06a9\u06cc\u0633\u06d2 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u0645\u0642\u0635\u062f \u0627\u06cc\u0633\u0627 LLM \u0628\u0646\u0627\u0646\u0627 <strong>\u0646\u06c1\u06cc\u06ba<\/strong> \u06c1\u06d2 \u062c\u0648 ChatGPT \u06a9\u06cc \u0637\u0631\u062d \u06a9\u0627\u0645 \u06a9\u0631\u06d2\u06d4 \u0627\u0633 \u0645\u06cc\u06ba \u06a9\u0626\u06cc \u0631\u06a9\u0627\u0648\u0679\u06cc\u06ba \u06c1\u06cc\u06ba \u062c\u06cc\u0633\u06d2 \u0628\u0691\u06d2 \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679\u0633\u060c \u0645\u06c1\u06cc\u0646\u0648\u06ba \u06a9\u06cc \u062a\u0631\u0628\u06cc\u062a\u060c \u0627\u0648\u0631 \u0627\u0646\u0633\u0627\u0646\u06cc \u062a\u0627\u062b\u0631\u0627\u062a \u06a9\u06d2 \u0633\u0627\u062a\u06be \u06a9\u0645\u06a9 \u0633\u06cc\u06a9\u06be\u0646\u0627 (RLHF)\u060c \u0627\u0646 \u0633\u0628 
\u06a9\u0648 \u0627\u0633 \u0679\u06cc\u0648\u0679\u0648\u0631\u06cc\u0644 \u06a9\u06d2 \u0630\u0631\u06cc\u0639\u06d2 \u0628\u06c1\u062a\u0631 \u0637\u0648\u0631 \u067e\u0631 \u0633\u0645\u062c\u06be\u0627 \u062c\u0627 \u0633\u06a9\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<h3 id=\"heading-a-note-on-the-code\">\u06a9\u0648\u0688 \u06a9\u06d2 \u0628\u0627\u0631\u06d2 \u0645\u06cc\u06ba \u0646\u0648\u0679\u0633:<\/h3>\n<p>\u0627\u0633 \u0679\u06cc\u0648\u0679\u0648\u0631\u06cc\u0644 \u06a9\u0627 \u06a9\u0648\u0688 \u0628\u0646\u06cc\u0627\u062f\u06cc \u0637\u0648\u0631 \u067e\u0631 Claude Opus 4 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u062a\u06cc\u0627\u0631 \u06a9\u06cc\u0627 \u06af\u06cc\u0627 \u062a\u06be\u0627\u06d4 \u06cc\u06c1 \u0646\u0645\u0627\u06cc\u0627\u06ba \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0642\u0627\u0628\u0644 \u06c1\u06d2 \u06a9\u06cc\u0648\u0646\u06a9\u06c1 \u06cc\u06c1 \u0638\u0627\u06c1\u0631 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 LLM \u0635\u0631\u0641 \u0627\u06cc\u06a9 \u06a9\u0648\u0688\u0646\u06af \u0645\u062f\u062f\u06af\u0627\u0631 \u0646\u06c1\u06cc\u06ba \u06c1\u06d2 \u062c\u0648 \u0622\u067e \u06a9\u0648 \u062e\u0635\u0648\u0635\u06cc\u0627\u062a \u06a9\u0648 \u062a\u06cc\u0632\u06cc \u0633\u06d2 \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u0646\u06d2 \u0645\u06cc\u06ba \u0645\u062f\u062f \u06a9\u0631\u06d2 \u06af\u0627\u06d4 \u06cc\u06c1 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u0627 \u0627\u06cc\u06a9 \u0637\u0627\u0642\u062a\u0648\u0631 \u0679\u0648\u0644 \u0628\u06be\u06cc \u06c1\u0648 \u0633\u06a9\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>Claude \u06a9\u06d2 \u06c1\u0631 \u0627\u06cc\u06a9 \u062c\u0632\u0648 \u06a9\u06d2 \u0630\u0631\u06cc\u0639\u06d2 \u062a\u062e\u0644\u06cc\u0642\u060c \u0648\u0636\u0627\u062d\u062a \u0627\u0648\u0631 \u0627\u0639\u0627\u062f\u06c1 \u06a9\u0631\u0646\u06d2 \u0633\u06d2 \u0645\u062c\u06be\u06d2 \u0635\u0631\u0641 
\u062f\u0633\u062a\u0627\u0648\u06cc\u0632\u0627\u062a \u06a9\u0648 \u067e\u0691\u06be\u0646\u06d2 \u06a9\u06d2 \u0628\u062c\u0627\u0626\u06d2 LLM \u0679\u0631\u06cc\u0646\u0646\u06af \u06a9\u06d2 \u0627\u0646\u062f\u0631\u0648\u0646\u06cc \u0645\u0639\u0627\u0645\u0644\u0627\u062a \u06a9\u06d2 \u0628\u0627\u0631\u06d2 \u0645\u06cc\u06ba \u0628\u06c1\u062a \u06af\u06c1\u0631\u0627 \u0633\u0645\u062c\u06be \u062d\u0627\u0635\u0644 \u06c1\u0648\u0626\u06cc\u06d4<\/p>\n<p>\u0627\u06af\u0631 \u0622\u067e \u067e\u06cc\u0631\u0648\u06cc \u06a9\u0631 \u0631\u06c1\u06d2 \u06c1\u06cc\u06ba\u060c \u062a\u0648 \u0645\u06cc\u06ba \u0622\u067e \u06a9\u0648 \u0628\u06be\u06cc \u0627\u06cc\u0633\u0627 \u06a9\u0631\u0646\u06d2 \u06a9\u06cc \u062a\u0631\u063a\u06cc\u0628 \u062f\u06cc\u062a\u0627 \u06c1\u0648\u06ba\u06d4 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<h3 id=\"heading-what-well-cover\">\u06c1\u0645 \u06a9\u06cc\u0627 \u0627\u062d\u0627\u0637\u06c1 \u06a9\u0631\u06cc\u06ba \u06af\u06d2:<\/h3>\n<h2 id=\"heading-components-of-llm-training\">\u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u062a\u0639\u0644\u06cc\u0645 \u06a9\u06d2 \u0627\u062c\u0632\u0627\u0621<\/h2>\n<p>\u0627\u0633 \u0679\u06cc\u0648\u0679\u0648\u0631\u06cc\u0644 \u0645\u06cc\u06ba\u060c \u06c1\u0645 \u0628\u06c1\u062a\u0631 \u062a\u0641\u06c1\u06cc\u0645 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06a9\u0648\u0688 \u06a9\u06cc \u0645\u062b\u0627\u0644\u0648\u06ba \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0627\u06cc\u06a9 \u0627\u06cc\u06a9 \u06a9\u0631\u06a9\u06d2 \u062f\u0631\u062c \u0630\u06cc\u0644 \u0627\u062c\u0632\u0627\u0621 \u06a9\u0627 \u0627\u062d\u0627\u0637\u06c1 \u06a9\u0631\u06cc\u06ba \u06af\u06d2\u06d4<\/p>\n<ol>\n<li>\n<p>\u0688\u06cc\u0679\u0627 \u06a9\u06cc 
\u062a\u06cc\u0627\u0631\u06cc<\/p>\n<\/li>\n<li>\n<p>\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646<\/p>\n<\/li>\n<li>\n<p>\u067e\u06cc\u0634\u06af\u06cc \u062a\u0631\u0628\u06cc\u062a<\/p>\n<\/li>\n<li>\n<p>\u0632\u06cc\u0631 \u0646\u06af\u0631\u0627\u0646\u06cc \u0641\u0627\u0626\u0646 \u0679\u06cc\u0648\u0646\u0646\u06af (SFT)<\/p>\n<\/li>\n<li>\n<p>\u062a\u0639\u06cc\u0646\u0627\u062a\u06cc<\/p>\n<\/li>\n<\/ol>\n<h3 id=\"heading-tech-stack-required\">\u0679\u06cc\u06a9\u0646\u0627\u0644\u0648\u062c\u06cc \u0627\u0633\u0679\u06cc\u06a9 \u062f\u0631\u06a9\u0627\u0631 \u06c1\u06d2\u06d4<\/h3>\n<p>\u0627\u0642\u062f\u0627\u0645\u0627\u062a \u0634\u0631\u0648\u0639 \u06a9\u0631\u0646\u06d2 \u0633\u06d2 \u067e\u06c1\u0644\u06d2\u060c \u06cc\u06c1 \u0679\u06cc\u06a9\u0646\u0627\u0644\u0648\u062c\u06cc \u0627\u0633\u0679\u06cc\u06a9 \u06c1\u06d2 \u062c\u0633 \u06a9\u06cc \u0622\u067e \u06a9\u0648 \u0636\u0631\u0648\u0631\u062a \u06c1\u0648\u06af\u06cc:<\/p>\n<ol>\n<li>\n<p>Python 3.9+<\/p>\n<\/li>\n<li>\n<p>PyTorch<\/p>\n<\/li>\n<li>\n<p>Tokenizer\/SentencePiece<\/p>\n<\/li>\n<li>\n<p>Hugging Face datasets \u0627\u0648\u0631 hub<\/p>\n<\/li>\n<li>\n<p>regex\u060c BeautifulSoup4\u060c requests (\u0688\u06cc\u0679\u0627 \u06a9\u06cc \u0635\u0641\u0627\u0626\u06cc \u06a9\u06d2 \u0644\u06cc\u06d2)<\/p>\n<\/li>\n<li>\n<p>tqdm\u060c matplotlib (\u062a\u0631\u0628\u06cc\u062a\u06cc \u0627\u0641\u0627\u062f\u06cc\u062a \u06a9\u06d2 \u0644\u06cc\u06d2)<\/p>\n<\/li>\n<li>\n<p>Gradio (\u0686\u06cc\u0679 UI \u062a\u0639\u06cc\u0646\u0627\u062a\u06cc \u06a9\u06d2 \u0644\u06cc\u06d2)<\/p>\n<\/li>\n<li>\n<p>Google Colab (\u062a\u0631\u0628\u06cc\u062a 
\u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u0641\u062a T4 GPU)<\/p>\n<\/li>\n<\/ol>\n<p><strong>\u0645\u06cc\u0645\u0648:<\/strong> \u06cc\u0642\u06cc\u0646\u06cc \u0628\u0646\u0627\u0626\u06cc\u06ba \u06a9\u06c1 \u0622\u067e \u0646\u06d2 \u0630\u06cc\u0644 \u0645\u06cc\u06ba \u062f\u0631\u062c \u062a\u0645\u0627\u0645 \u0627\u0646\u062d\u0635\u0627\u0631\u0627\u062a \u06a9\u0648 \u0627\u0646\u0633\u0679\u0627\u0644 \u06a9\u0631 \u0644\u06cc\u0627 \u06c1\u06d2: <code>requirements.txt<\/code> \u0634\u0631\u0648\u0639 \u06a9\u0631\u0646\u06d2 \u0633\u06d2 \u067e\u06c1\u0644\u06d2\u060c \u0627\u067e\u0646\u06d2 \u0630\u062e\u06cc\u0631\u06c1 \u0645\u06cc\u06ba \u0645\u0648\u062c\u0648\u062f \u0641\u0627\u0626\u0644\u0648\u06ba \u06a9\u0648 \u0686\u06cc\u06a9 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<h2 id=\"heading-1-data-preparation\">1. \u0688\u06cc\u0679\u0627 \u06a9\u06cc \u062a\u06cc\u0627\u0631\u06cc<\/h2>\n<p>\u0688\u06cc\u0679\u0627 \u06a9\u06cc \u062a\u06cc\u0627\u0631\u06cc \u0645\u06cc\u06ba \u0633\u0628 \u0633\u06d2 \u067e\u06c1\u0644\u06d2 \u06a9\u0627\u0645 \u06a9\u0631\u0646\u0627 \u06c1\u06d2\u06d4 <strong>\u0688\u06cc\u0679\u0627 \u0627\u06a9\u0679\u06be\u0627 \u06a9\u0631\u0646\u0627<\/strong>. 
LLMs \u06a9\u0648 \u0679\u06cc\u06a9\u0633\u0679 \u0688\u06cc\u0679\u0627 \u06a9\u06cc \u0628\u0691\u06cc \u0645\u0642\u062f\u0627\u0631 \u067e\u0631 \u062a\u0631\u0628\u06cc\u062a \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u0648\u062a\u06cc \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0688\u06cc\u0679\u0627 \u062d\u0627\u0635\u0644 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06a9\u0648\u0626\u06cc \u0627\u06cc\u06a9 \u062c\u06af\u06c1 \u0646\u06c1\u06cc\u06ba \u06c1\u06d2\u06d4 \u0622\u067e \u062c\u0633 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0628\u0646\u0627\u0646\u0627 \u0686\u0627\u06c1\u062a\u06d2 \u06c1\u06cc\u06ba \u0627\u0633 \u067e\u0631 \u0645\u0646\u062d\u0635\u0631 \u06c1\u06d2\u060c \u0622\u067e \u0645\u062a\u0646\u0648\u0639 \u0630\u0631\u0627\u0626\u0639 \u0633\u06d2 \u0645\u062a\u0646 \u062c\u0645\u0639 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<ul>\n<li>\n<p><strong>\u0688\u06cc\u062c\u06cc\u0679\u0644 \u0644\u0627\u0626\u0628\u0631\u06cc\u0631\u06cc\u0627\u06ba \u0627\u0648\u0631 \u0622\u0631\u06a9\u0627\u0626\u06cc\u0648\u0632:<\/strong> \u0627\u0646\u0679\u0631\u0646\u06cc\u0679 \u0622\u0631\u06a9\u0627\u0626\u06cc\u0648 \u06cc\u0627 \u0648\u06cc\u06a9\u06cc\u067e\u06cc\u0688\u06cc\u0627 \u0688\u0645\u067e<\/p>\n<\/li>\n<li>\n<p><strong>\u06a9\u0648\u0688 \u0630\u062e\u06cc\u0631\u06c1:<\/strong> GitHub\u060c GitLab (\u0627\u06af\u0631 \u0622\u067e \u06a9\u06d2 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u06a9\u0648\u0688 \u06a9\u0648 \u0633\u0645\u062c\u06be\u0646\u06d2 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u0648 \u062a\u0648 \u0645\u0641\u06cc\u062f \u06c1\u06d2)<\/p>\n<\/li>\n<li>\n<p><strong>\u0648\u06cc\u0628 \u0633\u06a9\u0631\u06cc\u067e\u0646\u06af:<\/strong> \u062e\u0648\u062f\u06a9\u0627\u0631 \u0627\u0633\u06a9\u0631\u067e\u0679\u0633 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0648\u06cc\u0628 
\u0633\u0627\u0626\u0679\u0633\u060c \u0628\u0644\u0627\u06af\u0632 \u0627\u0648\u0631 \u0641\u0648\u0631\u0645\u0632 \u06a9\u0648 \u06a9\u0631\u0627\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u062a\u0639\u0644\u06cc\u0645\u06cc \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679\u0633:<\/strong> \u062a\u062d\u0642\u06cc\u0642\u06cc \u0645\u0642\u0627\u0644\u06d2\u060c \u0627\u0648\u067e\u0646 \u0631\u0633\u0627\u0626\u06cc \u062c\u0631\u0646\u0644\u0632<\/p>\n<\/li>\n<li>\n<p><strong>\u067e\u06c1\u0644\u06d2 \u0633\u06d2 \u062a\u06cc\u0627\u0631 \u06a9\u0631\u062f\u06c1 \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679\u0633:<\/strong> Hugging Face Datasets \u0627\u0648\u0631 Kaggle \u062c\u06cc\u0633\u06d2 \u067e\u0644\u06cc\u0679 \u0641\u0627\u0631\u0645 \u06c1\u0632\u0627\u0631\u0648\u06ba \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06d2 \u0644\u06cc\u06d2 \u062a\u06cc\u0627\u0631 \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679\u0633 \u06a9\u06cc \u0645\u06cc\u0632\u0628\u0627\u0646\u06cc \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<\/ul>\n<p>\u062f\u0631\u062d\u0642\u06cc\u0642\u062a\u060c \u0628\u0691\u06d2 \u067e\u06cc\u0645\u0627\u0646\u06d2 \u067e\u0631 LLMs \u062c\u06cc\u0633\u06d2 GPT \u0627\u0648\u0631 LLaMA \u062e\u0648\u062f\u06a9\u0627\u0631 \u067e\u0627\u0626\u067e \u0644\u0627\u0626\u0646\u0648\u06ba \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0645\u062e\u062a\u0644\u0641 \u0630\u0631\u0627\u0626\u0639 \u0633\u06d2 \u0648\u06cc\u0628 \u0633\u06a9\u0631\u06cc\u067e\u0646\u06af \u067e\u0631 \u0628\u06c1\u062a \u0632\u06cc\u0627\u062f\u06c1 \u0627\u0646\u062d\u0635\u0627\u0631 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u062a\u0627\u06c1\u0645\u060c \u067e\u06cc\u0631\u0648\u06cc \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u0626\u06d2 \u0627\u06cc\u06a9 \u0627\u06c1\u0645 \u0627\u0635\u0648\u0644 \u06c1\u06d2. 
<strong>\u0635\u0631\u0641 \u0639\u0648\u0627\u0645\u06cc \u0637\u0648\u0631 \u067e\u0631 \u062f\u0633\u062a\u06cc\u0627\u0628\u060c \u0627\u0648\u067e\u0646 \u0633\u0648\u0631\u0633 \u0688\u06cc\u0679\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/strong> \u0630\u0627\u062a\u06cc \u0645\u0639\u0644\u0648\u0645\u0627\u062a \u06cc\u0627 \u0646\u062c\u06cc \u0635\u0627\u0631\u0641 \u06a9\u06cc \u0645\u0639\u0644\u0648\u0645\u0627\u062a \u06a9\u0648 \u0646\u06c1 \u06a9\u06be\u0631\u0686\u06cc\u06ba\u06d4 \u0627\u0633 \u0688\u06cc\u0679\u0627 \u067e\u0631 \u0642\u0627\u0626\u0645 \u0631\u06c1\u06cc\u06ba \u062c\u0648 \u0639\u0648\u0627\u0645\u06cc \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0648\u0627\u0636\u062d \u0637\u0648\u0631 \u067e\u0631 \u0634\u06cc\u0626\u0631 \u06a9\u06cc\u0627 \u06af\u06cc\u0627 \u06c1\u0648 \u06cc\u0627 \u062c\u0648 \u0627\u062c\u0627\u0632\u062a \u0646\u0627\u0645\u06c1 \u06a9\u06d2 \u062a\u062d\u062a \u0622\u062a\u0627 \u06c1\u0648\u06d4<\/p>\n<p><strong>\u0628\u06be\u06cc\u060c<\/strong> \u0627\u0646 \u0627\u0635\u0648\u0644\u0648\u06ba \u06a9\u0648 \u0630\u06c1\u0646 \u0645\u06cc\u06ba \u0631\u06a9\u06be\u06cc\u06ba: <strong>\u06a9\u0686\u0631\u0627 \u0627\u0646\u062f\u0631\u060c \u06a9\u0686\u0631\u0627 \u0628\u0627\u06c1\u0631<\/strong>. \u0635\u0631\u0641 \u0688\u06cc\u0679\u0627 \u062d\u0627\u0635\u0644 \u06a9\u0631\u0646\u0627 \u06a9\u0627\u0641\u06cc \u0646\u06c1\u06cc\u06ba \u06c1\u06d2\u06d4 \u06cc\u06c1 \u062f\u0631\u0633\u062a\u060c \u0635\u0627\u0641 \u0627\u0648\u0631 \u0634\u0648\u0631 \u0633\u06d2 \u067e\u0627\u06a9 \u06c1\u0648\u0646\u0627 \u0686\u0627\u06c1\u06cc\u06d2\u06d4<\/p>\n<p>\u0639\u0645\u0644\u06cc \u0637\u0648\u0631 \u067e\u0631\u060c \u0688\u06cc\u0679\u0627 \u0645\u062e\u062a\u0644\u0641 \u0630\u0631\u0627\u0626\u0639 \u0633\u06d2 \u062c\u0645\u0639 \u06a9\u06cc\u0627 \u062c\u0627 \u0633\u06a9\u062a\u0627 \u06c1\u06d2. 
\u0645\u06cc\u0631\u06d2 \u0645\u0639\u0627\u0645\u0644\u06d2 \u0645\u06cc\u06ba \u0645\u062c\u06be\u06d2 <strong>Hugging Face<\/strong> \u067e\u0631 \u06a9\u0627\u0641\u06cc \u0688\u06cc\u0679\u0627 \u0645\u0644\u0627\u06d4 Hugging Face \u067e\u0631 <strong>CulturaX<\/strong> \u0646\u0627\u0645\u06cc \u06a9\u062b\u06cc\u0631 \u0644\u0633\u0627\u0646\u06cc \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u0645\u0648\u062c\u0648\u062f \u06c1\u06d2\u06d4 \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679 \u0628\u06c1\u062a \u0628\u0691\u0627 \u062a\u06be\u0627\u060c \u0644\u06c1\u0630\u0627 \u0645\u06cc\u06ba \u0646\u06d2 \u067e\u0648\u0631\u06cc \u0686\u06cc\u0632 \u06a9\u06d2 \u0628\u062c\u0627\u0626\u06d2 \u0627\u0633 \u06a9\u0627 \u0635\u0631\u0641 \u0627\u06cc\u06a9 \u062d\u0635\u06c1 \u0688\u0627\u0624\u0646 \u0644\u0648\u0688 \u06a9\u06cc\u0627\u06d4<\/p>\n<p>\u0627\u0633 \u0679\u06cc\u0648\u0679\u0648\u0631\u06cc\u0644 \u0645\u06cc\u06ba <strong>Hugging Face<\/strong> \u0645\u06cc\u0631\u0627 \u0688\u06cc\u0679\u0627 \u0633\u0648\u0631\u0633 \u06c1\u06d2\u06d4 \u0645\u06cc\u06ba \u0646\u06d2 \u0627\u0633\u06d2 \u0686\u0646\u062f \u0648\u062c\u0648\u06c1\u0627\u062a \u06a9\u06cc \u0628\u0646\u0627 \u067e\u0631 \u0645\u0646\u062a\u062e\u0628 \u06a9\u06cc\u0627\u06d4<\/p>\n<p>\u0633\u0628 \u0633\u06d2 \u067e\u06c1\u0644\u06d2\u060c \u0686\u0648\u0646\u06a9\u06c1 \u0645\u06cc\u0631\u0627 \u0645\u0642\u0635\u062f \u06cc\u06c1 \u062c\u0627\u0646\u0646\u0627 \u062a\u06be\u0627 \u06a9\u06c1 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u06a9\u06cc\u0633\u06d2 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u060c \u0645\u06cc\u06ba \u0627\u067e\u0646\u0627 \u0648\u0642\u062a \u0648\u06cc\u0628 
\u0633\u06a9\u0631\u06cc\u067e\u0631 \u0644\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0628\u062c\u0627\u0626\u06d2 \u0645\u0627\u0688\u0644\u0632 \u067e\u0631 \u06af\u0632\u0627\u0631\u0646\u0627 \u0686\u0627\u06c1\u062a\u0627 \u062a\u06be\u0627\u06d4 Hugging Face \u067e\u06c1\u0644\u06d2 \u0633\u06d2 \u06c1\u06cc \u0627\u06cc\u06a9 \u0635\u0627\u0641 \u0633\u062a\u06be\u0631\u0627\u060c \u0633\u0679\u0631\u06a9\u0686\u0631\u0688 \u0641\u0627\u0631\u0645\u06cc\u0679 \u0645\u06cc\u06ba \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679\u0633 \u06a9\u0627 \u0627\u06cc\u06a9 \u0628\u0691\u0627 \u0645\u062c\u0645\u0648\u0639\u06c1 \u06c1\u06d2\u060c \u062c\u0633 \u0633\u06d2 \u0622\u067e \u06a9\u0648 \u0627\u0628\u062a\u062f\u0627\u0626\u06cc \u06a9\u0627\u0645 \u06a9\u06cc \u06a9\u0627\u0641\u06cc \u0628\u0686\u062a \u06c1\u0648\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<p>\u062f\u0648\u0633\u0631\u0627\u060c Hugging Face \u0632\u0628\u0627\u0646 \u0633\u06d2 \u0645\u062a\u0639\u0644\u0642 \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679 \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u0645\u06cc\u06ba \u0627\u0631\u062f\u0648 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u0628\u0646\u0627 \u0631\u06c1\u0627 \u062a\u06be\u0627 \u0627\u0633 \u0644\u06cc\u06d2 \u0645\u062c\u06be\u06d2 \u062e\u0627\u0635 \u0637\u0648\u0631 \u067e\u0631 \u0627\u0631\u062f\u0648 \u0679\u06cc\u06a9\u0633\u0679 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u062a\u06be\u06cc\u060c \u0627\u0648\u0631 Hugging Face \u06a9\u0627 <strong>CulturaX<\/strong> \u0627\u0631\u062f\u0648 \u0627\u0648\u0631 \u0628\u06c1\u062a \u0633\u06cc \u062f\u0648\u0633\u0631\u06cc \u0632\u0628\u0627\u0646\u0648\u06ba \u0633\u0645\u06cc\u062a \u06a9\u062b\u06cc\u0631 \u0644\u0633\u0627\u0646\u06cc \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679\u0633 \u067e\u06cc\u0634 \u06a9\u0631\u062a\u0627 
\u06c1\u06d2\u06d4 \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u0628\u06c1\u062a \u0628\u0691\u0627 \u062a\u06be\u0627\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u0645\u06cc\u06ba \u0646\u06d2 \u06cc\u06c1 \u0633\u0628 \u0688\u0627\u0624\u0646 \u0644\u0648\u0688 \u0646\u06c1\u06cc\u06ba \u06a9\u06cc\u0627\u060c \u0644\u06cc\u06a9\u0646 \u0627\u0633 \u06a9\u0627 \u0635\u0631\u0641 \u0627\u06cc\u06a9 \u062d\u0635\u06c1\u06d4<\/p>\n<p><strong>\u0627\u06c1\u0645:<\/strong> \u0627\u0633 \u0633\u06d2 \u067e\u06c1\u0644\u06d2 \u06a9\u06c1 \u0622\u067e Hugging Face \u0633\u06d2 \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679 \u0688\u0627\u0624\u0646 \u0644\u0648\u0688 \u06a9\u0631\u0646\u0627 \u0634\u0631\u0648\u0639 \u06a9\u0631 \u0633\u06a9\u06cc\u06ba\u060c \u0622\u067e \u06a9\u0648 \u0627\u06cc\u06a9 \u0627\u06a9\u0627\u0624\u0646\u0679 \u0628\u0646\u0627\u0646\u0627 \u06c1\u0648\u06af\u0627\u06d4 \u0627\u0633 \u06a9\u06d2 \u0628\u0639\u062f \u0622\u067e CLI \u0645\u06cc\u06ba \u0644\u0627\u06af \u0627\u0646 \u06a9\u0631\u06a9\u06d2 \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u0688\u0627\u0624\u0646 \u0644\u0648\u0688 \u06a9\u0631\u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p>\u0646\u06cc\u0686\u06d2 \u062f\u06cc \u06af\u0626\u06cc \u0627\u0633\u06a9\u0631\u067e\u0679 Hugging Face \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u06a9\u0648 \u0644\u0648\u0688 \u06a9\u0631\u062a\u06cc \u06c1\u06d2 \u0627\u0648\u0631 <code>streaming<\/code> \u06a9\u0648 <code>True<\/code> \u067e\u0631 \u0633\u06cc\u0679 \u06a9\u0631\u062a\u06cc \u06c1\u06d2\u06d4 
\u0627\u06cc\u0633\u0627 \u06a9\u0631\u0646\u06d2 \u06a9\u0627 \u0645\u0642\u0635\u062f \u06cc\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u062a\u0645\u0627\u0645 \u0627\u0639\u062f\u0627\u062f \u0648 \u0634\u0645\u0627\u0631 \u06a9\u0648 \u0688\u0627\u0624\u0646 \u0644\u0648\u0688 \u06a9\u0631\u0646\u06d2 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u0646\u06c1\u06cc\u06ba \u06c1\u06d2\u060c \u0635\u0631\u0641 \u0648\u06c1 \u0646\u0645\u0648\u0646\u06d2 \u062c\u0648 <code>NUM_SAMPLES<\/code> \u0645\u06cc\u06ba \u0628\u06cc\u0627\u0646 \u06a9\u06cc\u06d2 \u06af\u0626\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<pre><code class=\"language-python\"># ============================================================\n# Option A: Download from CulturaX (recommended, high quality)\n# ============================================================\n# CulturaX is a cleaned version of mC4 + OSCAR\n# We stream it to avoid downloading the entire dataset\n\nNUM_SAMPLES = 100_000  # Start with 100K samples (~50-100MB text)\n\nprint(\"Loading CulturaX Urdu dataset (streaming)...\")\ndataset = load_dataset(\n    \"uonlp\/CulturaX\",\n    \"ur\",                    # Urdu language code\n    split=\"train\",\n    streaming=True,          # Don't download everything\n    trust_remote_code=True\n)\n\n# Collect samples\nraw_texts = []\nfor i, sample in enumerate(tqdm(dataset, total=NUM_SAMPLES, desc=\"Downloading\")):\n    if i >= NUM_SAMPLES:\n        break\n    raw_texts.append(sample[\"text\"])\n\nprint(f\"\\nDownloaded {len(raw_texts)} samples\")\nprint(f\"Total characters: {sum(len(t) for t in raw_texts):,}\")\nprint(f\"\\nSample text (first 500 chars):\")\nprint(raw_texts[0][:500])\n<\/code><\/pre>\n<h3 id=\"heading-data-cleaning\">\u0688\u06cc\u0679\u0627 \u06a9\u06cc \u0635\u0641\u0627\u0626\u06cc<\/h3>\n<p>\u0645\u0627\u0688\u0644 \u06a9\u06cc \u062a\u0631\u0628\u06cc\u062a \u0634\u0631\u0648\u0639 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0635\u0631\u0641 \u0688\u06cc\u0679\u0627 
\u06c1\u0648\u0646\u0627 \u06a9\u0627\u0641\u06cc \u0646\u06c1\u06cc\u06ba \u06c1\u06d2\u06d4 \u0627\u06af\u0644\u0627 \u0645\u0631\u062d\u0644\u06c1 \u0634\u0627\u06cc\u062f \u0633\u0628 \u0633\u06d2 \u0627\u06c1\u0645 \u06c1\u06d2\u06d4 <strong>\u0688\u06cc\u0679\u0627 \u06a9\u06cc \u0635\u0641\u0627\u0626\u06cc<\/strong>. \u0645\u0642\u0635\u062f \u0688\u06cc\u0679\u0627 \u06a9\u0648 \u06c1\u0631 \u0645\u0645\u06a9\u0646 \u062d\u062f \u062a\u06a9 \u062e\u0627\u0644\u0635 \u0628\u0646\u0627\u0646\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u0632\u0628\u0627\u0646 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u062e\u0635\u0648\u0635 \u0627\u0631\u062f\u0648 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u0628\u0646\u0627\u062a\u06d2 \u0648\u0642\u062a\u060c \u0645\u062c\u06be\u06d2 \u063a\u06cc\u0631 \u0627\u0631\u062f\u0648 \u0645\u062a\u0646\u060c \u0627\u06cc\u0686 \u0679\u06cc \u0627\u06cc\u0645 \u0627\u06cc\u0644 \u0644\u0646\u06a9\u0633\u060c \u062e\u0635\u0648\u0635\u06cc \u062d\u0631\u0648\u0641\u060c \u0688\u067e\u0644\u06cc\u06a9\u06cc\u0679 \u0645\u0648\u0627\u062f\u060c \u0627\u0648\u0631 \u0636\u0631\u0648\u0631\u062a \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u062e\u0627\u0644\u06cc \u062c\u06af\u06c1 \u06a9\u0648 \u06c1\u0679\u0627\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06a9\u0644\u06cc\u0646 \u0627\u067e \u0645\u0646\u0637\u0642 \u0644\u06a9\u06be\u0646\u0627 \u067e\u0691\u06cc\u06d4 \u06cc\u06c1 \u062a\u0645\u0627\u0645 \u0639\u0648\u0627\u0645\u0644 \u062a\u0631\u0628\u06cc\u062a\u06cc \u0688\u06cc\u0679\u0627 \u06a9\u0648 \u0622\u0644\u0648\u062f\u06c1 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba \u0627\u0648\u0631 \u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646 \u0645\u0633\u0627\u0626\u0644 \u067e\u06cc\u062f\u0627 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p>\u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679 \u06a9\u06cc \u0642\u0633\u0645 \u067e\u0631 
\u0645\u0646\u062d\u0635\u0631 \u06c1\u06d2\u060c \u06a9\u0686\u06be \u0645\u062e\u0635\u0648\u0635 \u0632\u0628\u0627\u0646 \u06cc\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06d2 \u0645\u0639\u0627\u0645\u0644\u06d2 \u06a9\u06cc \u062a\u0646\u0638\u06cc\u0645 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u0648\u06af\u06cc\u06d4<\/p>\n<p>\u0622\u067e \u06a9\u06d2 \u0644\u06cc\u06d2 \u0646\u0626\u06cc \u0686\u06cc\u0632\u0648\u06ba \u0645\u06cc\u06ba \u0633\u06d2 \u0627\u06cc\u06a9 <strong>NFKC \u06cc\u0648\u0646\u06cc\u06a9\u0648\u0688 \u0646\u0627\u0631\u0645\u0644\u0627\u0626\u0632\u06cc\u0634\u0646<\/strong> \u0642\u062f\u0645 \u06cc\u06c1 \u0645\u062a\u0646 \u06a9\u0648 \u0645\u0639\u0645\u0648\u0644 \u0628\u0646\u0627\u062a\u0627 \u06c1\u06d2 \u062c\u0648 \u0627\u06cc\u06a9 \u062c\u06cc\u0633\u0627 \u0646\u0638\u0631 \u0622\u062a\u0627 \u06c1\u06d2 \u0644\u06cc\u06a9\u0646 \u0627\u06cc\u06a9 \u0645\u0639\u06cc\u0627\u0631\u06cc \u0641\u0627\u0631\u0645\u06cc\u0679 \u06a9\u0648 \u0628\u0631\u0642\u0631\u0627\u0631 \u0631\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u062e\u062a\u0644\u0641 \u06cc\u0648\u0646\u06cc\u06a9\u0648\u0688 \u0641\u0627\u0631\u0645\u06cc\u0679\u0633 \u0645\u06cc\u06ba \u0645\u0648\u062c\u0648\u062f \u06c1\u06d2\u06d4<\/p>\n<p>\u0622\u067e \u06a9\u0686\u06be \u0631\u06cc\u06af\u0648\u0644\u0631 \u0627\u06cc\u06a9\u0633\u067e\u0631\u06cc\u0634\u0646 \u067e\u06cc\u0679\u0631\u0646 \u0628\u06be\u06cc \u062f\u06cc\u06a9\u06be \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba \u062c\u0648 \u0635\u0631\u0641 \u0627\u0631\u062f\u0648 \u0679\u06cc\u06a9\u0633\u0679 \u0631\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06c1\u0648\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0627\u0631\u062f\u0648 \u0631\u0633\u0645 \u0627\u0644\u062e\u0637 \u0639\u0631\u0628\u06cc \u067e\u0631 \u0645\u0628\u0646\u06cc \u06c1\u06d2 \u0627\u0633 \u0644\u06cc\u06d2 \u0639\u0631\u0628\u06cc 
\u06cc\u0648\u0646\u06cc\u06a9\u0648\u0688 \u0631\u06cc\u0646\u062c \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u0645\u06cc\u06ba \u0646\u06d2 \u062f\u0631\u062c \u0630\u06cc\u0644 \u0646\u0645\u0648\u0646\u06d2 \u0628\u06be\u06cc \u06c1\u0679\u0627 \u062f\u06cc\u06d2 \u06c1\u06cc\u06ba: <code>\/\/<\/code>, <code>--<\/code>\u0627\u0648\u0631 \u062e\u0627\u0645 \u0688\u06cc\u0679\u0627 \u0645\u06cc\u06ba \u0645\u0648\u062c\u0648\u062f \u0627\u0636\u0627\u0641\u06cc \u062e\u0627\u0644\u06cc \u062c\u06af\u06c1\u06d4<\/p>\n<p>\u0627\u0633 \u0635\u0641\u0627\u0626\u06cc \u06a9\u06d2 \u0644\u06cc\u06d2 \u06a9\u0626\u06cc \u062a\u06a9\u0631\u0627\u0631 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u062a\u06be\u06cc\u06d4 \u06c1\u0631 \u0628\u0627\u0631\u060c \u06c1\u0645 \u0646\u06d2 \u0646\u062a\u0627\u0626\u062c \u06a9\u0627 \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 \u062c\u0627\u0626\u0632\u06c1 \u0644\u06cc\u0627 \u0627\u0648\u0631 \u0645\u062a\u0636\u0627\u062f \u0648\u0642\u0641\u06c1 \u06a9\u0627\u0631\u06cc\u060c \u0644\u0645\u0628\u06cc \u0688\u06cc\u0634\u0632\u060c \u0627\u0648\u0631 \u063a\u0644\u0637 \u0627\u0648\u0642\u0627\u0641 \u062c\u06cc\u0633\u06d2 \u0645\u0633\u0627\u0626\u0644 \u06a9\u06cc \u0646\u0634\u0627\u0646\u062f\u06c1\u06cc \u06a9\u06cc\u06d4 \u06cc\u06c1 \u0633\u0628 \u0627\u06af\u0644\u06d2 \u0645\u0631\u0627\u062d\u0644 \u067e\u0631 \u0645\u0646\u0641\u06cc \u0627\u062b\u0631 \u0688\u0627\u0644 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u0627\u0633\u06d2 \u0627\u0686\u06be\u06cc \u0637\u0631\u062d \u0635\u0627\u0641 \u06a9\u0631\u0646\u0627 \u0636\u0631\u0648\u0631\u06cc \u06c1\u06d2\u06d4<\/p>\n<p>\u0627\u0633 \u0633\u06d2 \u0622\u067e \u06a9\u0648 \u0627\u0646\u062f\u0627\u0632\u06c1 \u06c1\u0648\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u0688\u06cc\u0679\u0627 \u06a9\u0627 \u062d\u0635\u06c1 \u0627\u0628 \u0628\u06be\u06cc 
\u06a9\u062a\u0646\u0627 \u0627\u06c1\u0645 \u06c1\u06d2 \u0627\u0648\u0631 LLM \u0688\u06cc\u0679\u0627 \u067e\u0631 \u06a9\u062a\u0646\u0627 \u0645\u0646\u062d\u0635\u0631 \u06c1\u06d2\u06d4<\/p>\n<p>\u06cc\u06c1\u0627\u06ba \u0635\u0641\u0627\u0626\u06cc \u06a9\u06cc \u062e\u0635\u0648\u0635\u06cc\u0627\u062a \u06c1\u06cc\u06ba \u062c\u0648 \u0645\u06cc\u06ba \u0646\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06cc \u06c1\u06cc\u06ba:<\/p>\n<pre><code class=\"language-python\">def clean_urdu_text(text: str) -> str:\n    \"\"\"\n    Clean a single Urdu text document.\n    \n    Steps:\n    1. Remove URLs\n    2. Remove HTML tags and entities\n    3. Remove email addresses\n    4. Normalize Unicode (NFKC normalization)\n    5. Remove non-Urdu characters (keep Urdu + punctuation + digits)\n    6. Normalize repeated punctuation (\u06d4\u06d4\u06d4, ..., - -, etc.)\n    7. Normalize whitespace\n    \"\"\"\n    import unicodedata\n    \n    # Step 1: Remove URLs\n    text = re.sub(r'https?:\/\/\\S+|www\\.\\S+', '', text)\n    \n    # Step 2: Remove HTML tags\n    text = re.sub(r'<[^>]+>', '', text)\n    # Remove HTML entities\n    text = re.sub(r'&[a-zA-Z]+;', ' ', text)\n    text = re.sub(r'&amp;#\\d+;', ' ', text)\n    \n    # Step 3: Remove email addresses\n    text = re.sub(r'\\S+@\\S+', '', text)\n    \n    # Step 4: Unicode normalization (NFKC)\n    # This normalizes different representations of the same character\n    text = unicodedata.normalize('NFKC', text)\n    \n    # Step 5: Keep only Urdu characters, basic punctuation, digits, and whitespace\n    # Urdu Unicode ranges + Arabic punctuation + Western digits + basic punctuation\n    urdu_pattern = regex.compile(\n        r'[^'\n        r'\\u0600-\\u06FF'    # Arabic (includes Urdu)\n        r'\\u0750-\\u077F'    # Arabic Supplement\n        r'\\u08A0-\\u08FF'    # Arabic Extended-A\n        r'\\uFB50-\\uFDFF'    # Arabic Presentation Forms-A\n        r'\\uFE70-\\uFEFF'    # Arabic Presentation 
Forms-B\n        r'0-9\u06f0-\u06f9'           # Western and Eastern Arabic-Indic digits\n        r'\\s'               # Whitespace\n        r'\u06d4\u060c\u061f!\u066a'           # Urdu punctuation (full stop, comma, question mark, etc.)\n        r'.,:;!?\\-\\(\\)\"\\']'  # Basic Latin punctuation\n    )\n    text = urdu_pattern.sub(' ', text)\n    \n    # Step 6: Normalize repeated punctuation\n    text = re.sub(r'\u06d4{2,}', '\u06d4', text)\n    text = re.sub(r'\\.{2,}', '.', text)\n    text = re.sub(r'-\\s*-+', '-', text)\n    text = re.sub(r'-{2,}', '-', text)\n    text = re.sub(r'\u060c{2,}', '\u060c', text)\n    text = re.sub(r',{2,}', ',', text)\n    text = re.sub(r'\\s+[\u06d4\\.\\-,\u060c]\\s+', ' ', text)\n    \n    # Step 7: Normalize whitespace\n    text = re.sub(r'\\n{3,}', '\\n\\n', text)  # Max 2 newlines\n    text = re.sub(r'[^\\S\\n]+', ' ', text)    # Collapse spaces (but keep newlines)\n    text = text.strip()\n    \n    return text\n\n\ndef is_mostly_urdu(text: str, threshold: float = 0.5) -> bool:\n    \"\"\"\n    Check if text is mostly Urdu characters.\n    This filters out documents that are primarily English\/other languages.\n    \n    threshold: minimum fraction of characters that must be Urdu\n    \"\"\"\n    if len(text) == 0:\n        return False\n    urdu_chars = len(regex.findall(r'[\\u0600-\\u06FF\\u0750-\\u077F\\u08A0-\\u08FF\\uFB50-\\uFDFF\\uFE70-\\uFEFF]', text))\n    return (urdu_chars \/ len(text)) > threshold\n\n\n# Test the cleaning function\nsample = raw_texts[0]\nprint(\"=== BEFORE CLEANING ===\")\nprint(sample[:300])\nprint(\"\\n=== AFTER CLEANING ===\")\ncleaned = clean_urdu_text(sample)\nprint(cleaned[:300])\nprint(f\"\\nIs mostly Urdu: {is_mostly_urdu(cleaned)}\")\n<\/code><\/pre>\n<p>\u0635\u0641\u0627\u0626\u06cc \u06a9\u06d2 \u0628\u0639\u062f \u0688\u06cc\u0679\u0627 \u06a9\u0648 \u062f\u0648 \u0641\u0627\u0631\u0645\u06cc\u0679\u0633 \u0645\u06cc\u06ba \u0645\u062d\u0641\u0648\u0638 \u06a9\u06cc\u0627 
\u06af\u06cc\u0627\u06d4 <strong>\u0679\u06cc\u06a9\u0633\u0679 \u0641\u0627\u0626\u0644<\/strong> (\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u06cc \u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2) \u0627\u0648\u0631 <strong>JSONL \u0641\u0627\u0626\u0644<\/strong> (\u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u0645\u06cc\u06ba \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2) \u0627\u06af\u0644\u06d2 \u0645\u0631\u062d\u0644\u06d2 \u0645\u06cc\u06ba \u06c1\u0631 \u0641\u0627\u0631\u0645\u06cc\u0679 \u06a9\u0648 \u0627\u06cc\u06a9 \u062e\u0627\u0635 \u0645\u0642\u0635\u062f \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<h2 id=\"heading-2-tokenization\">2. \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646<\/h2>\n<p>\u0635\u0641\u0627\u0626\u06cc \u06a9\u06d2 \u0628\u0639\u062f \u0627\u06af\u0644\u0627 \u0645\u0631\u062d\u0644\u06c1 \u06c1\u06d2\u06d4 <strong>\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646<\/strong>. 
\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646 \u0645\u062a\u0646 \u06a9\u0648 \u0646\u0645\u0628\u0631\u0648\u06ba \u0645\u06cc\u06ba \u062a\u0628\u062f\u06cc\u0644 \u06a9\u0631\u0646\u06d2 \u0627\u0648\u0631 \u0627\u0646 \u0646\u0645\u0628\u0631\u0648\u06ba \u06a9\u0648 \u0648\u0627\u067e\u0633 \u0645\u062a\u0646 \u0645\u06cc\u06ba \u062a\u0628\u062f\u06cc\u0644 \u06a9\u0631\u0646\u06d2 \u06a9\u0627 \u0627\u06cc\u06a9 \u0637\u0631\u06cc\u0642\u06c1 \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u06cc\u06c1 \u0636\u0631\u0648\u0631\u06cc \u06c1\u06d2 \u06a9\u06cc\u0648\u0646\u06a9\u06c1 \u0639\u0635\u0628\u06cc \u0646\u06cc\u0679 \u0648\u0631\u06a9 \u0645\u062a\u0646 \u06a9\u0648 \u0646\u06c1\u06cc\u06ba \u0633\u0645\u062c\u06be \u0633\u06a9\u062a\u06d2\u060c \u0635\u0631\u0641 \u0627\u0639\u062f\u0627\u062f\u06d4 \u0644\u06c1\u0630\u0627 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646 \u0628\u0646\u06cc\u0627\u062f\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0627\u0646\u0633\u0627\u0646\u06cc \u0632\u0628\u0627\u0646 \u06a9\u06d2 \u062f\u0631\u0645\u06cc\u0627\u0646 \u062a\u0631\u062c\u0645\u06c1 \u06a9\u06cc \u067e\u0631\u062a \u06c1\u06d2 \u0627\u0648\u0631 \u062c\u0648 \u0645\u0627\u0688\u0644 \u0633\u0646\u0628\u06be\u0627\u0644 \u0633\u06a9\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u0645\u062b\u0627\u0644 \u06a9\u06d2 \u0637\u0648\u0631 \u067e\u0631:<\/p>\n<pre><code class=\"language-plaintext\">\"hello world\"  \u2192  [\"hel\", \"lo\", \" world\"]  \u2192  [1245, 532, 995]\n\"\u0627\u0631\u062f\u0648 \u0632\u0628\u0627\u0646\"   \u2190  [\"\u0627\u0631\", \"\u062f\u0648\", \"\u0632\u0628\", \"\u0627\u0646\"]  \u2190  [412, 87, 953, 201]\n<\/code><\/pre>\n<h3 id=\"heading-tokenization-approaches\">\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646 \u06a9\u0627 \u0637\u0631\u06cc\u0642\u06c1<\/h3>\n<p>\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646 \u06a9\u06d2 
\u062a\u06cc\u0646 \u0627\u06c1\u0645 \u0637\u0631\u06cc\u0642\u06d2 \u06c1\u06cc\u06ba:<\/p>\n<h4 id=\"heading-approach-1-character-level\">\u0646\u0642\u0637\u06c1 \u0646\u0638\u0631 1: \u06a9\u0631\u062f\u0627\u0631 \u06a9\u06cc \u0633\u0637\u062d<\/h4>\n<p>\u06cc\u06c1 \u0646\u0642\u0637\u06c1 \u0646\u0638\u0631 \u0622\u067e \u06a9\u0648 \u0645\u062a\u0646 \u06a9\u0648 \u0627\u0646\u0641\u0631\u0627\u062f\u06cc \u062d\u0631\u0648\u0641 \u0645\u06cc\u06ba \u062a\u0642\u0633\u06cc\u0645 \u06a9\u0631\u0646\u06d2 \u06a9\u06cc \u0627\u062c\u0627\u0632\u062a \u062f\u06cc\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<ul>\n<li>\n<p><code>hello<\/code>    -> <code>['h', 'e', 'l', 'l', 'o']<\/code><\/p>\n<\/li>\n<li>\n<p><code>\u0627\u0631\u062f\u0648<\/code>    -> <code>['\u0627', '\u0631', '\u062f', '\u0648']<\/code><\/p>\n<\/li>\n<\/ul>\n<p>\u0645\u0633\u0626\u0644\u06c1 \u06cc\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u0633\u0644\u0633\u0644\u06c1 \u0628\u06c1\u062a \u0637\u0648\u06cc\u0644 \u06c1\u0648 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 1000 \u0627\u0644\u0641\u0627\u0638 \u06a9\u06cc \u062f\u0633\u062a\u0627\u0648\u06cc\u0632 \u0645\u06cc\u06ba 5000 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0679\u0648\u06a9\u0646 \u06c1\u0648 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u062d\u0631\u0648\u0641 \u06a9\u0648 \u0627\u0644\u0641\u0627\u0638 \u0645\u06cc\u06ba \u062c\u0648\u0691\u0646\u06d2 \u06a9\u0627 \u0637\u0631\u06cc\u0642\u06c1 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u06d2\u060c \u062c\u0648 \u06a9\u06c1 \u0628\u06c1\u062a \u0645\u0634\u06a9\u0644 \u06c1\u06d2\u06d4<\/p>\n<h4 id=\"heading-approach-2-word-level\">\u0646\u0642\u0637\u06c1 \u0646\u0638\u0631 2: \u0644\u0641\u0638 \u06a9\u06cc \u0633\u0637\u062d<\/h4>\n<p>\u06cc\u06c1 \u0646\u0642\u0637\u06c1 \u0646\u0638\u0631 \u0627\u0644\u0641\u0627\u0638 \u06a9\u06d2 \u062f\u0631\u0645\u06cc\u0627\u0646 \u062c\u06af\u06c1 
\u06a9\u06cc \u0628\u0646\u06cc\u0627\u062f \u067e\u0631 \u062a\u0642\u0633\u06cc\u0645 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<ul>\n<li>\n<p><code>hello how are you<\/code>    -> <code>['hello', 'how', 'are', 'you']<\/code><\/p>\n<\/li>\n<li>\n<p><code>\u0627\u0631\u062f\u0648 \u0628\u06c1\u062a \u0627\u0686\u06be\u06cc \u0632\u0628\u0627\u0646 \u06c1\u06d2<\/code>    -> <code>['\u0627\u0631\u062f\u0648', '\u0628\u06c1\u062a', '\u0627\u0686\u06be\u06cc', '\u0632\u0628\u0627\u0646', '\u06c1\u06d2']<\/code><\/p>\n<\/li>\n<\/ul>\n<p>\u0627\u0633 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0645\u0633\u0626\u0644\u06c1 \u06cc\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u0632\u0628\u0627\u0646\u0648\u06ba \u06a9\u06cc \u0630\u062e\u06cc\u0631\u06c1 \u0627\u0644\u0641\u0627\u0638 \u0628\u06c1\u062a \u0632\u06cc\u0627\u062f\u06c1 \u06c1\u06cc\u06ba (\u0627\u0631\u062f\u0648 \u0645\u06cc\u06ba 100,000 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0645\u0646\u0641\u0631\u062f \u0627\u0644\u0641\u0627\u0638 \u06c1\u06cc\u06ba\u061b \u0627\u0646\u06af\u0631\u06cc\u0632\u06cc \u0645\u06cc\u06ba 170,000 \u0645\u0646\u0641\u0631\u062f \u0627\u0644\u0641\u0627\u0638 \u06c1\u06cc\u06ba)\u06d4 \u0645\u0627\u0688\u0644 \u0646\u0626\u06d2 \u06cc\u0627 \u0646\u0627\u06cc\u0627\u0628 \u0627\u0644\u0641\u0627\u0638 \u06a9\u0648 \u0646\u06c1\u06cc\u06ba \u0633\u0646\u0628\u06be\u0627\u0644 \u0633\u06a9\u062a\u0627 (\u0644\u0641\u0638 \u0633\u06d2 \u0628\u0627\u06c1\u0631 \u06a9\u0627 \u0645\u0633\u0626\u0644\u06c1)\u06d4<\/p>\n<h4 id=\"heading-approach-3-subword-using-bpe-byte-pair-encoding\">\u0646\u0642\u0637\u06c1 \u0646\u0638\u0631 3: \u0628\u0627\u0626\u0679 \u067e\u06cc\u0626\u0631 \u0627\u0646\u06a9\u0648\u0688\u0646\u06af (BPE) \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0630\u06cc\u0644\u06cc \u0627\u0644\u0641\u0627\u0638<\/h4>\n<p>\u0627\u0633 \u0646\u0642\u0637\u06c1 \u0646\u0638\u0631 \u06a9\u06d2 
\u0633\u0627\u062a\u06be\u060c \u0645\u0627\u0688\u0644 \u0688\u06cc\u0679\u0627 \u0633\u06d2 \u0639\u0627\u0645 \u06a9\u0631\u062f\u0627\u0631 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628 \u0633\u06cc\u06a9\u06be\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<ul>\n<li>\n<p><code>unhappiness<\/code>    \u0627\u0633\u06d2 \u0627\u0633 \u0637\u0631\u062d \u062a\u0642\u0633\u06cc\u0645 \u06a9\u06cc\u0627 \u062c\u0627 \u0633\u06a9\u062a\u0627 \u06c1\u06d2: <code>['un', 'happi', 'ness']<\/code><\/p>\n<\/li>\n<li>\n<p><code>\u0645\u06a9\u0645\u0644<\/code>    \u0627\u0633\u06d2 \u0627\u0633 \u0637\u0631\u062d \u062a\u0642\u0633\u06cc\u0645 \u06a9\u06cc\u0627 \u062c\u0627 \u0633\u06a9\u062a\u0627 \u06c1\u06d2: <code>['\u0645\u06a9\u0645', '\u0644']<\/code> \u06cc\u0627\u060c \u0627\u06af\u0631 \u06cc\u06c1 \u06a9\u0627\u0641\u06cc \u0639\u0627\u0645 \u06c1\u06d2\u060c \u062a\u0648 \u0627\u0633\u06d2 \u0645\u06a9\u0645\u0644 \u0631\u06a9\u06be\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<\/ul>\n<p>\u06cc\u06c1 \u0627\u06cc\u06a9 \u0686\u06be\u0648\u0679\u06cc \u0630\u062e\u06cc\u0631\u06c1 \u0627\u0644\u0641\u0627\u0638 \u06c1\u06d2 (32K \u0679\u0648\u06a9\u0646\u0632 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2) \u0627\u0648\u0631 \u062a\u0645\u0627\u0645 \u0627\u0644\u0641\u0627\u0638\u060c \u06cc\u06c1\u0627\u06ba \u062a\u06a9 \u06a9\u06c1 \u0646\u0626\u06d2 \u0627\u0644\u0641\u0627\u0638 \u0628\u06be\u06cc \u0633\u0646\u0628\u06be\u0627\u0644 \u0633\u06a9\u062a\u06cc \u06c1\u06d2\u06d4 \u0639\u0627\u0645 \u0627\u0644\u0641\u0627\u0638 \u06a9\u0648 \u0633\u0646\u06af\u0644 \u0679\u0648\u06a9\u0646 \u06a9\u06d2 \u0637\u0648\u0631 \u067e\u0631 \u0631\u06a9\u06be\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>BPE \u0627\u06cc\u06a9 \u0635\u0646\u0639\u062a\u06cc \u0645\u0639\u06cc\u0627\u0631 \u06c1\u06d2 \u062c\u0633\u06d2 GPT\u060c LLaMA\u060c \u0627\u0648\u0631 \u062c\u062f\u06cc\u062f \u062a\u0631\u06cc\u0646 LLMs 
\u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u06cc\u06c1\u0627\u06ba \u06cc\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u06cc\u06c1 \u0642\u062f\u0645 \u0628\u06c1 \u0642\u062f\u0645 \u06a9\u06cc\u0633\u06d2 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2:<\/p>\n<ol>\n<li>\n<p><strong>\u0627\u06cc\u06a9 \u062e\u0637 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0634\u0631\u0648\u0639 \u06c1\u0648\u062a\u0627 \u06c1\u06d2<\/strong>: \u0627\u0644\u0641\u0627\u0638 = \u062a\u0645\u0627\u0645 \u0627\u0646\u0641\u0631\u0627\u062f\u06cc \u062d\u0631\u0648\u0641<\/p>\n<\/li>\n<li>\n<p><strong>\u062c\u0648\u0691\u06cc \u06a9\u06cc \u06af\u0646\u062a\u06cc<\/strong>: \u0627\u06a9\u062b\u0631 \u0645\u0644\u062d\u0642\u06c1 \u0679\u0648\u06a9\u0646 \u062c\u0648\u0691\u06d2 \u062a\u0644\u0627\u0634 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u062c\u0630\u0628<\/strong>: \u062c\u0648\u0691\u06d2 \u06a9\u0648 \u0627\u06cc\u06a9 \u0646\u0626\u06d2 \u0679\u0648\u06a9\u0646 \u0645\u06cc\u06ba \u062c\u0648\u0691\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u062f\u06c1\u0631\u0627\u0626\u06cc\u06ba<\/strong>: \u062c\u0628 \u062a\u06a9 \u06a9\u06c1 \u0627\u0644\u0641\u0627\u0638 \u0645\u0637\u0644\u0648\u0628\u06c1 \u0633\u0627\u0626\u0632 \u062a\u06a9 \u0646\u06c1 \u067e\u06c1\u0646\u0686 \u062c\u0627\u0626\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<\/ol>\n<p>\u0645\u062b\u0627\u0644\u0648\u06ba \u0645\u06cc\u06ba \u0634\u0627\u0645\u0644 \u06c1\u06cc\u06ba:<\/p>\n<pre><code class=\"language-plaintext\">Start:  \u0627 \u0631 \u062f \u0648   \u0632 \u0628 \u0627 \u0646\nMerge 1: '\u0627 \u0631' -> '\u0627\u0631'    (most common pair)\nResult: \u0627\u0631 \u062f \u0648   \u0632 \u0628 \u0627 \u0646\nMerge 2: '\u0632 \u0628' -> '\u0632\u0628'    (next most common)\nResult: \u0627\u0631 \u062f \u0648   \u0632\u0628 \u0627 \u0646\n...and so on for 32,000 merges\n<\/code><\/pre>\n<p>\u06cc\u06c1 \u0648\u06c1 
\u0637\u0631\u06cc\u0642\u06c1 \u06c1\u06d2 \u062c\u0633\u06d2 \u0645\u06cc\u06ba \u0627\u067e\u0646\u06d2 \u0627\u0631\u062f\u0648 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u0648\u06ba \u06af\u0627\u06d4 \u0645\u06cc\u06ba \u0646\u06d2 \u062a\u06cc\u0627\u0631 \u06a9\u0631\u062f\u06c1 \u0627\u0631\u062f\u0648 \u06a9\u0627\u0631\u067e\u0633 \u067e\u0631 32K \u0679\u0648\u06a9\u0646 \u06a9\u06d2 \u0627\u0644\u0641\u0627\u0638 \u06a9\u06d2 \u0633\u0627\u0626\u0632 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0627\u06cc\u06a9 BPE \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u0648 \u062a\u0631\u0628\u06cc\u062a \u062f\u06cc\u06d4<\/p>\n<h3 id=\"heading-special-tokens\">\u062e\u0635\u0648\u0635\u06cc \u0679\u0648\u06a9\u0646<\/h3>\n<p>BPE \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0622\u067e \u06a9\u0648 \u06cc\u06c1 \u0628\u06be\u06cc \u0634\u0627\u0645\u0644 \u06a9\u0631\u0646\u0627 \u0686\u0627\u06c1\u0626\u06d2: <strong>\u062e\u0635\u0648\u0635\u06cc \u0679\u0648\u06a9\u0646<\/strong>. 
\u06cc\u06c1 \u0679\u0648\u06a9\u0646 \u062a\u0631\u0628\u06cc\u062a \u0627\u0648\u0631 \u062a\u062e\u0645\u06cc\u0646\u06c1 \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646 \u062f\u0631\u06a9\u0627\u0631 \u0645\u0627\u0688\u0644 \u0688\u06be\u0627\u0646\u0686\u06d2 \u06a9\u06cc \u0645\u0639\u0644\u0648\u0645\u0627\u062a \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<table>\n<thead>\n<tr>\n<th>\u0679\u0648\u06a9\u0646<\/th>\n<th>\u0645\u0642\u0635\u062f<\/th>\n<th>\u0622\u067e \u06a9\u0648 \u0627\u0633 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06a9\u06cc\u0648\u06ba \u06c1\u06d2\u06d4<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td><code>&lt;pad&gt;<\/code><\/td>\n<td>\u0645\u0633\u0627\u0648\u06cc \u0644\u0645\u0628\u0627\u0626\u06cc \u06a9\u06d2 \u0633\u0644\u0633\u0644\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u067e\u06cc\u0688\u0646\u06af<\/td>\n<td>\u0628\u06cc\u0686 \u067e\u0631\u0648\u0633\u06cc\u0633\u0646\u06af \u06a9\u06d2 \u0644\u06cc\u06d2\u060c \u062a\u0645\u0627\u0645 \u062a\u0631\u062a\u06cc\u0628\u0648\u06ba \u06a9\u06cc \u0644\u0645\u0628\u0627\u0626\u06cc \u0627\u06cc\u06a9 \u062c\u06cc\u0633\u06cc \u06c1\u0648\u0646\u06cc \u0686\u0627\u06c1\u06cc\u06d2\u06d4 \u0645\u062e\u062a\u0635\u0631 \u0633\u0644\u0633\u0644\u06d2 \u0627\u0633 \u0633\u06d2 \u0628\u06be\u0631\u06d2 \u06c1\u0648\u0626\u06d2 \u06c1\u06cc\u06ba: <code>&lt;pad&gt;<\/code> \u0679\u0648\u06a9\u0646<\/td>\n<\/tr>\n<tr>\n<td><code>&lt;unk&gt;<\/code><\/td>\n<td>\u0646\u0627\u0645\u0639\u0644\u0648\u0645 \u0644\u0641\u0638 \u06a9\u0648 \u062a\u0628\u062f\u06cc\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/td>\n<td>\u062c\u0628 \u0645\u0627\u0688\u0644 \u06a9\u0627 \u0633\u0627\u0645\u0646\u0627 \u06a9\u0633\u06cc \u0627\u06cc\u0633\u06d2 \u0679\u0648\u06a9\u0646 \u0633\u06d2 \u06c1\u0648\u062a\u0627 \u06c1\u06d2 \u062c\u0648 \u0627\u0633 \u06a9\u06d2 \u0630\u062e\u06cc\u0631\u06c1 \u0627\u0644\u0641\u0627\u0638 \u0645\u06cc\u06ba \u0646\u06c1\u06cc\u06ba 
\u06c1\u06d2\u060c \u062a\u0648 \u0627\u0633\u06d2 \u0627\u0633 \u0637\u0631\u062d \u0646\u0642\u0634\u06c1 \u0628\u0646\u0627\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2: <code>&lt;unk&gt;<\/code> \u0646\u0627\u06a9\u0627\u0645 \u06c1\u0648\u0646\u06d2 \u06a9\u06d2 \u0628\u062c\u0627\u0626\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>&lt;bos&gt;<\/code><\/td>\n<td>\u0627\u06cc\u06a9 \u062a\u0631\u062a\u06cc\u0628 \u06a9\u06d2 \u0622\u063a\u0627\u0632 \u06a9\u0648 \u0646\u0634\u0627\u0646 \u0632\u062f \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/td>\n<td>\u06cc\u06c1 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0628\u062a\u0627\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u0627\u0646 \u067e\u0679 \u06a9\u06c1\u0627\u06ba \u0633\u06d2 \u0634\u0631\u0648\u0639 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u060c \u062c\u0633 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0642\u0627\u0628\u0644 \u0627\u0639\u062a\u0645\u0627\u062f \u0646\u0633\u0644 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u062a\u06cc \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>&lt;eos&gt;<\/code><\/td>\n<td>\u0627\u06cc\u06a9 \u062a\u0631\u062a\u06cc\u0628 \u06a9\u06d2 \u0627\u062e\u062a\u062a\u0627\u0645 \u06a9\u0648 \u0646\u0634\u0627\u0646 \u0632\u062f \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/td>\n<td>\u0645\u0627\u0688\u0644 \u06a9\u0648 \u0628\u062a\u0627\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u06a9\u0628 \u067e\u06cc\u062f\u0627 \u06a9\u0631\u0646\u0627 \u0628\u0646\u062f \u06a9\u0631\u0646\u0627 \u06c1\u06d2\u06d4 \u0628\u0635\u0648\u0631\u062a \u062f\u06cc\u06af\u0631\u060c \u0622\u0624\u0679 \u067e\u0679 \u06c1\u0645\u06cc\u0634\u06c1 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0686\u0644 \u0633\u06a9\u062a\u0627 \u06c1\u06d2 \u06cc\u0627 \u0628\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u0637\u0648\u0631 \u067e\u0631 \u0631\u06a9 \u0633\u06a9\u062a\u0627 \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>&lt;sep&gt;<\/code><\/td>\n<td>\u0627\u0644\u06af \u0627\u0644\u06af 
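\u0637\u0628\u0642\u0627\u062a<\/td>\n<td>\u0686\u06cc\u0679 \u0641\u0627\u0631\u0645\u06cc\u0679 \u0633\u0633\u0679\u0645 \u06a9\u06d2 \u0627\u0634\u0627\u0631\u06d2\u060c \u0635\u0627\u0631\u0641 \u06a9\u06d2 \u067e\u06cc\u063a\u0627\u0645\u0627\u062a\u060c \u0627\u0648\u0631 \u0645\u0639\u0627\u0648\u0646 \u062c\u0648\u0627\u0628\u0627\u062a \u06a9\u0648 \u0627\u0644\u06af \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u062a\u0627\u06a9\u06c1 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0645\u0639\u0644\u0648\u0645 \u06c1\u0648 \u06a9\u06c1 \u0627\u0633 \u06a9\u0627 \u06a9\u06cc\u0627 \u06a9\u0631\u062f\u0627\u0631 \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>&lt;|user|&gt;<\/code><\/td>\n<td>\u0635\u0627\u0631\u0641 \u06a9\u06cc \u0628\u0627\u0631\u06cc \u06a9\u0627 \u0646\u0634\u0627\u0646<\/td>\n<td>\u0686\u06cc\u0679 \u0641\u0627\u0631\u0645\u06cc\u0679 \u0645\u06cc\u06ba \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>&lt;|assistant|&gt;<\/code><\/td>\n<td>\u0645\u0639\u0627\u0648\u0646 \u06a9\u06cc \u0628\u0627\u0631\u06cc \u06a9\u0627 \u0646\u0634\u0627\u0646<\/td>\n<td>\u0686\u06cc\u0679 \u0641\u0627\u0631\u0645\u06cc\u0679 \u0645\u06cc\u06ba \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>&lt;|system|&gt;<\/code><\/td>\n<td>\u0633\u0633\u0679\u0645 \u06a9\u06d2 \u0627\u0634\u0627\u0631\u06d2 \u06a9\u0627 \u0646\u0634\u0627\u0646<\/td>\n<td>\u0686\u06cc\u0679 \u0641\u0627\u0631\u0645\u06cc\u0679 \u0645\u06cc\u06ba \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3 id=\"heading-bpe-tokenizer-configuration\">BPE \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u0646\u0641\u06cc\u06af\u0631\u06cc\u0634\u0646<\/h3>\n<p>\u0645\u06cc\u06ba \u0646\u06d2 \u0627\u0644\u0641\u0627\u0638 \u06a9\u0627 \u0633\u0627\u0626\u0632 \u0627\u0633 \u067e\u0631 \u0633\u06cc\u0679 \u06a9\u06cc\u0627: <strong>32K<\/strong>. 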
\u0627\u0633 \u06a9\u0627 \u06a9\u06cc\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2\u061f \u0627\u0633 \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u06a9\u06c1 \u0645\u0627\u0688\u0644 \u06a9\u06d2 \u0627\u0644\u0641\u0627\u0638 \u06a9\u06cc \u062a\u0644\u0627\u0634 \u06a9\u06d2 \u062c\u062f\u0648\u0644 \u0645\u06cc\u06ba 32K \u0679\u0648\u06a9\u0646\u0632 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p>\u06cc\u06c1 \u0632\u0628\u0627\u0646 \u06a9\u06cc \u06a9\u0648\u0631\u06cc\u062c \u0627\u0648\u0631 \u0645\u0627\u0688\u0644 \u0633\u0627\u0626\u0632 \u06a9\u06d2 \u062f\u0631\u0645\u06cc\u0627\u0646 \u0627\u06cc\u06a9 \u0627\u0686\u06be\u0627 \u062a\u0648\u0627\u0632\u0646 \u06c1\u06d2\u06d4 \u0627\u0644\u0641\u0627\u0638 \u06a9\u06d2 \u0633\u0627\u0626\u0632 \u06a9\u0648 \u0628\u0691\u06be\u0627\u0646\u06d2 \u0633\u06d2 \u0633\u0631\u0627\u06cc\u062a \u06a9\u0631\u0646\u06d2 \u0648\u0627\u0644\u06cc \u067e\u0631\u062a \u0627\u0648\u0631 \u0622\u0624\u0679 \u067e\u0679 \u067e\u0631\u062a \u062f\u0648\u0646\u0648\u06ba \u0645\u06cc\u06ba \u0627\u0636\u0627\u0641\u06c1 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u060c \u062c\u0633 \u06a9\u06d2 \u0646\u062a\u06cc\u062c\u06d2 \u0645\u06cc\u06ba \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u0632\u06cc\u062f \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 \u06c1\u0648\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0645\u0646\u0635\u0648\u0628\u0648\u06ba \u06a9\u06d2 \u0644\u06cc\u06d2\u060c 32K \u06c1\u0631 \u0686\u06cc\u0632 \u06a9\u0648 \u0642\u0627\u0628\u0644 \u0627\u0646\u062a\u0638\u0627\u0645 \u0631\u06a9\u06be\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p><code>MIN_FREQUENCY<\/code>    \u06cc\u06c1 2 \u067e\u0631 \u0633\u06cc\u0679 \u06a9\u06cc\u0627 \u06af\u06cc\u0627 \u06c1\u06d2\u06d4 \u0627\u0633 \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u06a9\u06c1 \u0634\u0627\u0645\u0644 \u06a9\u06cc\u06d2 \u062c\u0627\u0646\u06d2 \u06a9\u06d2 
\u0644\u06cc\u06d2 \u0627\u06cc\u06a9 \u0679\u0648\u06a9\u0646 \u06a9\u0645 \u0627\u0632 \u06a9\u0645 \u062f\u0648 \u0628\u0627\u0631 \u0638\u0627\u06c1\u0631 \u06c1\u0648\u0646\u0627 \u0686\u0627\u06c1\u06cc\u06d2\u06d4 \u06cc\u06c1 \u0648\u0646 \u0622\u0641 \u0634\u0648\u0631 \u0679\u0648\u06a9\u0646\u0632 \u06a9\u0648 \u0641\u0644\u0679\u0631 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u062c\u0648 \u0627\u0644\u0641\u0627\u0638 \u06a9\u06cc \u0633\u0644\u0627\u0679\u0633 \u06a9\u0648 \u0636\u0627\u0626\u0639 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p><strong>\u062d\u0648\u0627\u0644\u06c1 \u06a9\u06d2 \u0644\u06cc\u06d2:<\/strong> GPT-2 50K \u0679\u0648\u06a9\u0646\u0632 \u06a9\u0627 \u0630\u062e\u06cc\u0631\u06c1 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u060c \u062c\u0628\u06a9\u06c1 LLaMA 32K \u0679\u0648\u06a9\u0646 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u06c1\u0645 \u0646\u06d2 \u062c\u0648 32K \u0645\u0646\u062a\u062e\u0628 \u06a9\u06cc\u0627 \u06c1\u06d2 \u0648\u06c1 \u067e\u0631\u0648\u0688\u06a9\u0634\u0646 \u0645\u0627\u0688\u0644 \u0633\u06d2 \u0645\u06cc\u0644 \u06a9\u06be\u0627\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<pre><code class=\"language-python\">VOCAB_SIZE = 32_000  # Number of tokens in our vocabulary\nMIN_FREQUENCY = 2    # Token must appear at least twice (filters noise)\n\n# Special tokens - these have reserved IDs\nSPECIAL_TOKENS = [\n    \"&lt;pad&gt;\",    # ID 0: padding\n    \"&lt;unk&gt;\",    # ID 1: unknown\n    \"&lt;bos&gt;\",    # ID 2: beginning of sequence \n    \"&lt;eos&gt;\",    # ID 3: end of sequence\n    \"&lt;sep&gt;\",    # ID 4: separator (for chat format)\n    \"&lt;|user|&gt;\",     # ID 5: user turn marker (for chat)\n    \"&lt;|assistant|&gt;\", # ID 6: assistant turn marker (for chat)\n    \"&lt;|system|&gt;\",    # ID 7: system prompt marker (for chat)\n]\n<\/code><\/pre>\n<h3 
id=\"heading-building-the-tokenizer\">\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u0628\u0646\u0627\u0646\u0627<\/h3>\n<p>\u0627\u06af\u0644\u0627\u060c \u06c1\u0645 \u067e\u06c1\u0644\u06d2 \u0628\u0646\u0627\u0626\u06cc \u06af\u0626\u06cc \u06a9\u0644\u06cc\u0646 \u0627\u067e \u0679\u06cc\u06a9\u0633\u0679 \u0641\u0627\u0626\u0644 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u0628\u0646\u0627\u0626\u06cc\u06ba \u06af\u06d2\u06d4 \u0633\u0628 \u0633\u06d2 \u067e\u06c1\u0644\u06d2\u060c \u0645\u0637\u0644\u0648\u0628\u06c1 \u0644\u0627\u0626\u0628\u0631\u06cc\u0631\u06cc\u0648\u06ba \u06a9\u0648 \u062f\u0631\u0622\u0645\u062f \u06a9\u0631\u06cc\u06ba \u0627\u0648\u0631 \u0641\u0627\u0626\u0644 \u06a9\u0627 \u0631\u0627\u0633\u062a\u06c1 \u0633\u06cc\u0679 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<pre><code class=\"language-python\">import os\nfrom pathlib import Path\nfrom tokenizers import (\n    Tokenizer,\n    models,\n    trainers,\n    pre_tokenizers,\n    decoders,\n    processors,\n    normalizers,\n)\n\nPROJECT_ROOT = Path(\".\").resolve().parent\nCLEANED_DIR = PROJECT_ROOT \/ \"data\" \/ \"cleaned\"\nTOKENIZER_DIR = PROJECT_ROOT \/ \"tokenizer\" \/ \"urdu_tokenizer\"\nTOKENIZER_DIR.mkdir(parents=True, exist_ok=True)\n\nCORPUS_FILE = str(CLEANED_DIR \/ \"urdu_corpus.txt\")\nprint(f\"Corpus file: {CORPUS_FILE}\")\nprint(f\"Tokenizer output: {TOKENIZER_DIR}\")\n\n# Verify corpus exists\nassert os.path.exists(CORPUS_FILE), f\"Corpus not found at {CORPUS_FILE}. 
Run notebook 01 first!\"\nfile_size_mb = os.path.getsize(CORPUS_FILE) \/ 1024 \/ 1024\nprint(f\"Corpus size: {file_size_mb:.1f} MB\")\n<\/code><\/pre>\n<p>\u0627\u0628 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u062c\u0632\u0648 \u06a9\u0648 \u06a9\u0646\u0641\u06cc\u06af\u0631 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<pre><code class=\"language-python\"># ============================================================\n# Build the tokenizer\n# ============================================================\n\n# Step 1: Create a BPE model (the core algorithm)\ntokenizer = Tokenizer(models.BPE(unk_token=\"&lt;unk&gt;\"))\n\n# Step 2: Add normalizer (text cleaning before tokenization)\n# NFKC normalizes Unicode (e.g., different forms of the same Arabic letter)\ntokenizer.normalizer = normalizers.NFKC()\n\n# Step 3: Pre-tokenizer (how to split text before BPE)\n# We use Metaspace which replaces spaces with \u2581 and splits on them\n# This preserves space information so we can reconstruct the original text\ntokenizer.pre_tokenizer = pre_tokenizers.Metaspace()\n\n# Step 4: Decoder (how to convert tokens back to text)\n# Metaspace decoder converts \u2581 back to spaces\ntokenizer.decoder = decoders.Metaspace()\n\n# Step 5: Configure the trainer\ntrainer = trainers.BpeTrainer(\n    vocab_size=VOCAB_SIZE,\n    min_frequency=MIN_FREQUENCY,\n    special_tokens=SPECIAL_TOKENS,\n    show_progress=True,\n    initial_alphabet=[]  # Learn alphabet from data\n)\n\nprint(\"Tokenizer configured. 
Ready to train!\")\n<\/code><\/pre>\n<h3 id=\"heading-training-the-tokenizer\">\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u06cc \u062a\u0631\u0628\u06cc\u062a<\/h3>\n<p>\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u0646\u0641\u06cc\u06af\u0631 \u06c1\u0648\u0646\u06d2 \u06a9\u06d2 \u0628\u0639\u062f\u060c \u0627\u06af\u0644\u0627 \u0645\u0631\u062d\u0644\u06c1 \u0627\u0633\u06d2 \u0686\u0644\u0627\u0646\u0627 \u06c1\u06d2\u06d4 \u0622\u067e \u06a9\u06d2 \u0622\u0644\u06d2 \u067e\u0631 \u0645\u0646\u062d\u0635\u0631 \u06c1\u06d2\u060c \u0627\u0633 \u0645\u06cc\u06ba \u062a\u0642\u0631\u06cc\u0628\u0627\u064b 5 \u0633\u06d2 10 \u0645\u0646\u0679 \u0644\u06af\u06cc\u06ba \u06af\u06d2\u06d4<\/p>\n<pre><code class=\"language-python\">print(\"Training tokenizer... (this may take a few minutes)\")\ntokenizer.train([CORPUS_FILE], trainer)\n\nprint(f\"\\n Tokenizer trained!\")\nprint(f\"  Vocabulary size: {tokenizer.get_vocab_size():,}\")\n<\/code><\/pre>\n<h3 id=\"heading-configuring-post-processing-auto-wrapping-with-boseos\">\u067e\u0648\u0633\u0679 \u067e\u0631\u0648\u0633\u06cc\u0633\u0646\u06af \u06a9\u0646\u0641\u06cc\u06af\u0631\u06cc\u0634\u0646 (BOS\/EOS \u06a9\u06d2 \u0633\u0627\u062a\u06be \u062e\u0648\u062f\u06a9\u0627\u0631 \u0631\u06cc\u067e\u0646\u06af)<\/h3>\n<p>\u0627\u06af\u0644\u0627\u060c \u06c1\u0645 \u067e\u0648\u0633\u0679 \u067e\u0631\u0648\u0633\u06cc\u0633\u0646\u06af \u06a9\u0648 \u062a\u0631\u062a\u06cc\u0628 \u062f\u06cc\u062a\u06d2 \u06c1\u06cc\u06ba \u062a\u0627\u06a9\u06c1 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u062e\u0648\u062f \u0628\u062e\u0648\u062f \u062a\u0645\u0627\u0645 \u062a\u0631\u062a\u06cc\u0628 \u06a9\u0648 \u0633\u0645\u06cc\u0679 \u0644\u06d2\u06d4 <code>&lt;bos&gt;<\/code> \u0627\u0648\u0631 <code>&lt;eos&gt;<\/code> \u0679\u0648\u06a9\u0646 \u0627\u0633 \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u06a9\u06c1 \u062c\u0628 \u0628\u06be\u06cc \u0622\u067e \u0645\u062a\u0646 
\u06a9\u0648 \u0627\u0646\u06a9\u0648\u0688 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba \u0622\u067e \u06a9\u0648 \u0627\u0633\u06d2 \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0634\u0627\u0645\u0644 \u06a9\u0631\u0646\u06d2 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u0646\u06c1\u06cc\u06ba \u06c1\u06d2\u06d4<\/p>\n<pre><code class=\"language-python\">bos_id = tokenizer.token_to_id(\"&lt;bos&gt;\")\neos_id = tokenizer.token_to_id(\"&lt;eos&gt;\")\n\ntokenizer.post_processor = processors.TemplateProcessing(\n    single=f\"&lt;bos&gt;:0 $A:0 &lt;eos&gt;:0\",\n    pair=f\"&lt;bos&gt;:0 $A:0 &lt;sep&gt;:0 $B:1 &lt;eos&gt;:1\",\n    special_tokens=[\n        (\"&lt;bos&gt;\", bos_id),\n        (\"&lt;eos&gt;\", eos_id),\n        (\"&lt;sep&gt;\", tokenizer.token_to_id(\"&lt;sep&gt;\")),\n    ],\n)\n\nprint(\"Post-processor configured (auto-adds &lt;bos&gt; and &lt;eos&gt;)\")\n<\/code><\/pre>\n<p><strong>\u0646\u0648\u0679:<\/strong> \u0622\u067e \u0633\u0648\u0686 \u0631\u06c1\u06d2 \u06c1\u0648\u06ba \u06af\u06d2 \u06a9\u06c1 \u062c\u0628 \u06c1\u0645 \u067e\u06c1\u0644\u06d2 \u06c1\u06cc <code>&lt;bos&gt;<\/code> \u0627\u0648\u0631 <code>&lt;eos&gt;<\/code> \u06a9\u0648 <code>SPECIAL_TOKENS<\/code> \u0645\u06cc\u06ba \u0634\u0627\u0645\u0644 \u06a9\u0631 \u0686\u06a9\u06d2 \u06c1\u06cc\u06ba \u062a\u0648 \u0627\u0633 \u0642\u062f\u0645 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06a9\u06cc\u0648\u06ba \u06c1\u06d2\u06d4 \u0641\u0631\u0642 \u06cc\u06c1 \u06c1\u06d2 
\u06a9\u06c1 <code>SPECIAL_TOKENS<\/code> \u06a9\u06cc \u0641\u06c1\u0631\u0633\u062a \u0635\u0631\u0641 \u0627\u0646 \u0679\u0648\u06a9\u0646\u0632 \u06a9\u06d2 \u0644\u06cc\u06d2 <strong>\u0630\u062e\u06cc\u0631\u06c1 \u0627\u0644\u0641\u0627\u0638 \u06a9\u06d2 \u0633\u0644\u0627\u0679<\/strong> (ID \u062a\u0641\u0648\u06cc\u0636) \u0645\u062d\u0641\u0648\u0638 \u06a9\u0631\u062a\u06cc \u06c1\u06d2\u060c \u062c\u0628\u06a9\u06c1 \u067e\u0648\u0633\u0679 \u067e\u0631\u0648\u0633\u06cc\u0633\u0646\u06af \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u0648 \u06c1\u062f\u0627\u06cc\u062a \u06a9\u0631\u062a\u06cc \u06c1\u06d2 \u06a9\u06c1 \u0627\u0646\u06c1\u06cc\u06ba \u062a\u0645\u0627\u0645 \u0627\u0646\u06a9\u0648\u0688 \u0634\u062f\u06c1 \u062a\u0631\u062a\u06cc\u0628\u0648\u06ba \u0645\u06cc\u06ba <strong>\u062e\u0648\u062f \u06a9\u0627\u0631 \u0637\u0631\u06cc\u0642\u06d2 \u0633\u06d2 \u062f\u0627\u062e\u0644<\/strong> \u06a9\u0631\u06d2\u06d4<\/p>\n<p>\u0627\u0633 \u0642\u062f\u0645 \u06a9\u06d2 \u0628\u063a\u06cc\u0631\u060c \u0679\u0648\u06a9\u0646 \u0622\u067e \u06a9\u06d2 \u0630\u062e\u06cc\u0631\u06c1 \u0627\u0644\u0641\u0627\u0638 \u0645\u06cc\u06ba \u0645\u0648\u062c\u0648\u062f \u0631\u06c1\u06d2 \u06af\u0627\u060c \u0644\u06cc\u06a9\u0646 \u0622\u067e \u06a9\u06d2 \u0688\u06cc\u0679\u0627 \u0645\u06cc\u06ba \u0638\u0627\u06c1\u0631 \u0646\u06c1\u06cc\u06ba \u06c1\u0648\u06af\u0627 \u062c\u0628 \u062a\u06a9 \u06a9\u06c1 \u0622\u067e \u0627\u0633\u06d2 \u06c1\u0631 \u0628\u0627\u0631 \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0634\u0627\u0645\u0644 \u0646\u06c1 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<h3 id=\"heading-testing-the-tokenizer\">\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u0679\u06cc\u0633\u0679<\/h3>\n<p>\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646 \u06a9\u0627 \u0622\u062e\u0631\u06cc \u0645\u0631\u062d\u0644\u06c1 \u062c\u0627\u0646\u0686 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0679\u06cc\u0633\u0679 \u0627\u0631\u062f\u0648 \u062c\u0645\u0644\u0648\u06ba \u06a9\u0648 \u0679\u0648\u06a9\u0646 
\u0622\u0626\u06cc \u0688\u06cc\u0632 \u0645\u06cc\u06ba \u0627\u0646\u06a9\u0648\u0688 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u067e\u06be\u0631 \u0627\u0646 \u0622\u0626\u06cc \u0688\u06cc\u0632 \u06a9\u0648 \u062f\u0648\u0628\u0627\u0631\u06c1 \u0679\u06cc\u06a9\u0633\u0679 \u0645\u06cc\u06ba \u0688\u06cc \u06a9\u0648\u0688 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u0627\u06af\u0631 \u0688\u06cc \u06a9\u0648\u0688 \u0634\u062f\u06c1 \u0645\u062a\u0646 \u0627\u0635\u0644 \u0627\u0646 \u067e\u0679 \u0633\u06d2 \u0645\u06cc\u0644 \u06a9\u06be\u0627\u062a\u0627 \u06c1\u06d2 \u062a\u0648 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u0635\u062d\u06cc\u062d \u0637\u0631\u06cc\u0642\u06d2 \u0633\u06d2 \u06a9\u0627\u0645 \u06a9\u0631 \u0631\u06c1\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0631\u0627\u0624\u0646\u0688 \u0679\u0631\u067e \u0679\u06cc\u0633\u0679 \u0627\u0633 \u0628\u0627\u062a \u06a9\u0648 \u06cc\u0642\u06cc\u0646\u06cc \u0628\u0646\u0627\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u0627\u0646\u06a9\u0648\u0688\u0646\u06af \u0627\u0648\u0631 \u0688\u06cc \u06a9\u0648\u0688\u0646\u06af \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646 \u06a9\u0648\u0626\u06cc \u0628\u06be\u06cc \u0645\u0639\u0644\u0648\u0645\u0627\u062a \u0636\u0627\u0626\u0639 \u0646\u06c1 \u06c1\u0648\u06d4<\/p>\n<pre><code class=\"language-python\">test_sentences = [\n    \"\u0627\u0631\u062f\u0648 \u0627\u06cc\u06a9 \u0628\u06c1\u062a \u062e\u0648\u0628\u0635\u0648\u0631\u062a \u0632\u0628\u0627\u0646 \u06c1\u06d2\",           # \"Urdu is a very beautiful language\"\n    \"\u067e\u0627\u06a9\u0633\u062a\u0627\u0646 \u06a9\u0627 \u062f\u0627\u0631\u0627\u0644\u062d\u06a9\u0648\u0645\u062a \u0627\u0633\u0644\u0627\u0645 \u0622\u0628\u0627\u062f \u06c1\u06d2\",      # \"The capital of Pakistan is Islamabad\"\n    \"\u0622\u062c \u0645\u0648\u0633\u0645 \u0628\u06c1\u062a \u0627\u0686\u06be\u0627 \u06c1\u06d2\",                     # \"The weather is very nice today\"\n    
\"\u0645\u0635\u0646\u0648\u0639\u06cc \u0630\u06c1\u0627\u0646\u062a \u0645\u0633\u062a\u0642\u0628\u0644 \u06a9\u06cc \u0679\u06cc\u06a9\u0646\u0627\u0644\u0648\u062c\u06cc \u06c1\u06d2\",     # \"AI is the technology of the future\"\n    \"\u0627\u0644\u0633\u0644\u0627\u0645 \u0639\u0644\u06cc\u06a9\u0645! \u0622\u067e \u06a9\u06cc\u0633\u06d2 \u06c1\u06cc\u06ba\u061f\",               # \"Peace be upon you! How are you?\"\n]\n\nprint(\"=\" * 70)\nprint(\"TOKENIZER TEST RESULTS\")\nprint(\"=\" * 70)\n\nfor sentence in test_sentences:\n    encoded = tokenizer.encode(sentence)\n    decoded = tokenizer.decode(encoded.ids)\n    \n    print(f\"\\n Input:    {sentence}\")\n    print(f\" Token IDs: {encoded.ids}\")\n    print(f\"  Tokens:   {encoded.tokens}\")\n    print(f\" Decoded:  {decoded}\")\n    print(f\"   Num tokens: {len(encoded.ids)}\")\n    print(f\"   Roundtrip OK: {sentence in decoded}\")\n    print(\"-\" * 70)\n<\/code><\/pre>\n<p>\u0622\u0624\u0679 \u067e\u0679 \u06c1\u06d2:<\/p>\n<pre><code class=\"language-plaintext\">======================================================================\nTOKENIZER TEST RESULTS\n======================================================================\n\n Input:    \u0627\u0631\u062f\u0648 \u0627\u06cc\u06a9 \u0628\u06c1\u062a \u062e\u0648\u0628\u0635\u0648\u0631\u062a \u0632\u0628\u0627\u0646 \u06c1\u06d2\n Token IDs: [2, 1418, 324, 431, 2965, 1430, 276, 3]\n Tokens:   ['&lt;bos&gt;', '\u2581\u0627\u0631\u062f\u0648', '\u2581\u0627\u06cc\u06a9', '\u2581\u0628\u06c1\u062a', '\u2581\u062e\u0648\u0628\u0635\u0648\u0631\u062a', '\u2581\u0632\u0628\u0627\u0646', '\u2581\u06c1\u06d2', '&lt;eos&gt;']\n Decoded:  \u0627\u0631\u062f\u0648 \u0627\u06cc\u06a9 \u0628\u06c1\u062a \u062e\u0648\u0628\u0635\u0648\u0631\u062a \u0632\u0628\u0627\u0646 \u06c1\u06d2\n   Num tokens: 8\n   Roundtrip OK: True\n----------------------------------------------------------------------\n\n Input:    \u067e\u0627\u06a9\u0633\u062a\u0627\u0646 \u06a9\u0627 
\u062f\u0627\u0631\u0627\u0644\u062d\u06a9\u0648\u0645\u062a \u0627\u0633\u0644\u0627\u0645 \u0622\u0628\u0627\u062f \u06c1\u06d2\n Token IDs: [2, 474, 289, 3699, 616, 1004, 276, 3]\n Tokens:   ['&lt;bos&gt;', '\u2581\u067e\u0627\u06a9\u0633\u062a\u0627\u0646', '\u2581\u06a9\u0627', '\u2581\u062f\u0627\u0631\u0627\u0644\u062d\u06a9\u0648\u0645\u062a', '\u2581\u0627\u0633\u0644\u0627\u0645', '\u2581\u0622\u0628\u0627\u062f', '\u2581\u06c1\u06d2', '&lt;eos&gt;']\n Decoded:  \u067e\u0627\u06a9\u0633\u062a\u0627\u0646 \u06a9\u0627 \u062f\u0627\u0631\u0627\u0644\u062d\u06a9\u0648\u0645\u062a \u0627\u0633\u0644\u0627\u0645 \u0622\u0628\u0627\u062f \u06c1\u06d2\n   Num tokens: 8\n   Roundtrip OK: True\n<\/code><\/pre>\n<p>\u063a\u0648\u0631 \u06a9\u0631\u06cc\u06ba \u06a9\u06c1 <code>&lt;bos&gt;<\/code> \u0627\u0648\u0631 <code>&lt;eos&gt;<\/code> (\u067e\u0648\u0633\u0679 \u067e\u0631\u0648\u0633\u06cc\u0633\u0646\u06af \u0645\u0631\u062d\u0644\u06d2 \u06a9\u06cc \u0628\u062f\u0648\u0644\u062a) \u062e\u0648\u062f \u0628\u062e\u0648\u062f \u0634\u0627\u0645\u0644 \u06c1\u0648 \u062c\u0627\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 <code>\u067e\u0627\u06a9\u0633\u062a\u0627\u0646<\/code> \u062c\u06cc\u0633\u06d2 \u0639\u0627\u0645 \u0627\u0631\u062f\u0648 \u0627\u0644\u0641\u0627\u0638 \u0627\u06cc\u06a9 \u0648\u0627\u062d\u062f \u0679\u0648\u06a9\u0646 \u06a9\u06d2 \u0637\u0648\u0631 \u067e\u0631 \u0628\u0631\u0642\u0631\u0627\u0631 \u0631\u06c1\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 <code>\u2581<\/code> (\u0645\u06cc\u0679\u0627 \u0627\u0633\u067e\u06cc\u0633) \u0633\u0627\u0628\u0642\u06c1 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0644\u0641\u0627\u0638 \u06a9\u06cc \u062d\u062f\u0648\u062f \u06a9\u0648 \u0646\u0634\u0627\u0646 \u0632\u062f \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u0633\u0628 \u0633\u06d2 \u0627\u06c1\u0645 \u0628\u0627\u062a \u06cc\u06c1 \u06c1\u06d2 \u06a9\u06c1 
\u062a\u0645\u0627\u0645 \u0631\u0627\u0624\u0646\u0688 \u0679\u0631\u067e \u06a9\u0627\u0645\u06cc\u0627\u0628 \u06c1\u0648\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0627\u0633 \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u06a9\u06c1 \u0688\u06cc \u06a9\u0648\u0688 \u0634\u062f\u06c1 \u0645\u062a\u0646 \u0627\u0635\u0644 \u0627\u0646 \u067e\u0679 \u0633\u06d2 \u0628\u0627\u0644\u06a9\u0644 \u0645\u06cc\u0644 \u06a9\u06be\u0627\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<h3 id=\"heading-fertility-score\">\u0632\u0631\u062e\u06cc\u0632\u06cc \u0633\u06a9\u0648\u0631<\/h3>\n<p>\u0632\u0631\u062e\u06cc\u0632\u06cc \u0641\u06cc \u0644\u0641\u0638 \u0679\u0648\u06a9\u0646 \u06a9\u06cc \u0627\u0648\u0633\u0637 \u062a\u0639\u062f\u0627\u062f \u06c1\u06d2\u06d4<\/p>\n<ul>\n<li>\n<p>1 \u06a9\u06cc \u0632\u0631\u062e\u06cc\u0632\u06cc \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u06a9\u06c1 \u06c1\u0631 \u0644\u0641\u0638 \u0627\u06cc\u06a9 \u0679\u0648\u06a9\u0646 \u067e\u0631 \u0646\u0642\u0634\u06c1 \u0628\u0646\u0627\u062a\u0627 \u06c1\u06d2 (\u062c\u062f\u06cc\u062f \u0633\u0628 \u0648\u0631\u0688 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631\u0632 \u0645\u06cc\u06ba \u0645\u062b\u0627\u0644\u06cc \u0644\u06cc\u06a9\u0646 \u063a\u06cc\u0631 \u062d\u0642\u06cc\u0642\u06cc)\u06d4<\/p>\n<\/li>\n<li>\n<p>\u062c\u062f\u06cc\u062f LLMs \u0645\u06cc\u06ba\u060c \u0632\u0628\u0627\u0646 \u06a9\u06d2 \u0644\u062d\u0627\u0638 \u0633\u06d2 \u0632\u0631\u062e\u06cc\u0632\u06cc \u0639\u0627\u0645 \u0637\u0648\u0631 \u067e\u0631 1.3-2.5 \u06a9\u06d2 \u0644\u06af \u0628\u06be\u06af \u06c1\u0648\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0632\u06cc\u0627\u062f\u06c1 \u0632\u0631\u062e\u06cc\u0632\u06cc \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0679\u0648\u06a9\u0646 \u0627\u0633\u067e\u0644\u0679\u060c \u062c\u0648 \u0644\u0627\u06af\u062a \u06a9\u0648 
\u0628\u0691\u06be\u0627\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u06a9\u0627\u0631\u06a9\u0631\u062f\u06af\u06cc \u06a9\u0648 \u06a9\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u060c \u0644\u06cc\u06a9\u0646 \u0632\u0628\u0627\u0646 \u06a9\u06cc \u067e\u06cc\u0686\u06cc\u062f\u06af\u06cc \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0633\u0627\u062a\u06be \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u06d2 \u0645\u0639\u06cc\u0627\u0631 \u0633\u06d2 \u0628\u06be\u06cc \u0645\u062a\u0627\u062b\u0631 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<\/ul>\n<pre><code class=\"language-python\"># ============================================================\n# Calculate fertility score on training corpus\n# ============================================================\nimport json\n\njsonl_file = CLEANED_DIR \/ \"urdu_corpus.jsonl\"\ncorpus_words = 0\ncorpus_tokens = 0\nsample_size = 10000  # Sample 10K documents for speed\n\nprint(f\"Calculating fertility on {sample_size:,} documents from corpus...\")\n\nwith open(jsonl_file, \"r\", encoding=\"utf-8\") as f:\n    for i, line in enumerate(f):\n        if i >= sample_size:\n            break\n        doc = json.loads(line)\n        text = doc[\"text\"]\n        \n        words = text.split()\n        tokens = tokenizer.encode(text).tokens\n        n_tokens = len(tokens) - 2  # Remove &lt;bos&gt; and &lt;eos&gt;\n        \n        corpus_words += len(words)\n        corpus_tokens += n_tokens\n\ncorpus_fertility = corpus_tokens \/ corpus_words\nprint(f\"\\n&#x1f4ca; Fertility Score (corpus): {corpus_fertility:.2f} tokens\/word\")\nprint(f\"   (Total: {corpus_words:,} words \u2192 {corpus_tokens:,} tokens)\")\nprint(f\"   Documents sampled: {min(i+1, sample_size):,}\")\n\nif corpus_fertility < 2.0:\n    print(\"   &#x2705; Excellent! Tokenizer is well-optimized for Urdu.\")\nelif corpus_fertility < 3.0:\n    print(\"   &#x26a0; Good, but could be better. 
Consider larger vocab.\")\nelse:\n    print(\"   &#x274c; High fertility. The tokenizer needs improvement.\")\n<\/code><\/pre>\n<p>\u06c1\u0645\u06cc\u06ba \u06cc\u06c1\u0627\u06ba \u062d\u0627\u0635\u0644 \u06c1\u0648\u0646\u06d2 \u0648\u0627\u0644\u0627 \u0632\u0631\u062e\u06cc\u0632\u06cc \u0633\u06a9\u0648\u0631 1.04 \u06c1\u06d2\u060c \u062c\u0648 \u0628\u06c1\u062a \u0627\u0686\u06be\u0627 \u06c1\u06d2\u06d4 \u062a\u0627\u06c1\u0645\u060c \u0630\u06c1\u0646 \u0645\u06cc\u06ba \u0631\u06a9\u06be\u06cc\u06ba \u06a9\u06c1 \u06cc\u06c1 \u062a\u0639\u062f\u0627\u062f \u0645\u0635\u0646\u0648\u0639\u06cc \u0637\u0648\u0631 \u067e\u0631 \u06a9\u0645 \u06c1\u06d2 \u06a9\u06cc\u0648\u0646\u06a9\u06c1 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u0648 \u0627\u0633\u06cc \u0686\u06be\u0648\u0679\u06d2 \u06a9\u0627\u0631\u067e\u0633 \u067e\u0631 \u062a\u0631\u0628\u06cc\u062a \u062f\u06cc \u06af\u0626\u06cc \u062a\u06be\u06cc \u062c\u0633 \u067e\u0631 \u0627\u0633 \u06a9\u0627 \u062c\u0627\u0626\u0632\u06c1 \u0644\u06cc\u0627 \u06af\u06cc\u0627 \u062a\u06be\u0627\u06d4 \u0628\u0691\u06d2 \u06cc\u0627 \u067e\u0648\u0634\u06cc\u062f\u06c1 \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679\u0633 \u06a9\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06d2 \u0646\u062a\u06cc\u062c\u06d2 \u0645\u06cc\u06ba \u0632\u0631\u062e\u06cc\u0632\u06cc \u06a9\u06cc \u0634\u0631\u062d \u0632\u06cc\u0627\u062f\u06c1 \u06c1\u0648\u0646\u06d2 \u06a9\u0627 \u0627\u0645\u06a9\u0627\u0646 \u06c1\u06d2 (\u067e\u0631\u0648\u0688\u06a9\u0634\u0646 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631\u0632 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0639\u0627\u0645 1.3-2.5 \u0631\u06cc\u0646\u062c \u06a9\u06d2 \u0642\u0631\u06cc\u0628)\u06d4<\/p>\n<h3 id=\"heading-saving-the-tokenizer\">\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u0648 \u0645\u062d\u0641\u0648\u0638 \u06a9\u0631\u06cc\u06ba\u06d4<\/h3>\n<p>\u0622\u062e\u0631\u06cc \u0645\u0631\u062d\u0644\u06c1 
\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u0648 JSON \u0641\u0627\u0631\u0645\u06cc\u0679 \u0645\u06cc\u06ba \u0645\u062d\u0641\u0648\u0638 \u06a9\u0631\u0646\u0627 \u0627\u0648\u0631 \u0627\u0633 \u0628\u0627\u062a \u06a9\u0648 \u06cc\u0642\u06cc\u0646\u06cc \u0628\u0646\u0627\u0646\u0627 \u06c1\u06d2 \u06a9\u06c1 \u06cc\u06c1 \u0635\u062d\u06cc\u062d \u0637\u0631\u06cc\u0642\u06d2 \u0633\u06d2 \u0644\u0648\u0688 \u06c1\u0648\u06d4<\/p>\n<pre><code class=\"language-python\"># ============================================================\n# Save the tokenizer\n# ============================================================\n\ntokenizer_path = str(TOKENIZER_DIR \/ \"urdu_bpe_tokenizer.json\")\ntokenizer.save(tokenizer_path)\n\nprint(f\" Tokenizer saved to: {tokenizer_path}\")\nprint(f\"   File size: {os.path.getsize(tokenizer_path) \/ 1024:.0f} KB\")\n\n# Verify we can load it back\nloaded_tokenizer = Tokenizer.from_file(tokenizer_path)\ntest = loaded_tokenizer.encode(\"\u0627\u0631\u062f\u0648 \u0627\u06cc\u06a9 \u062e\u0648\u0628\u0635\u0648\u0631\u062a \u0632\u0628\u0627\u0646 \u06c1\u06d2\")\nprint(f\"\\n   Verification: {test.tokens}\")\nprint(f\"    Tokenizer loads correctly!\")\n<\/code><\/pre>\n<p>\u0627\u06cc\u06a9 \u0628\u0627\u0631 \u0645\u062d\u0641\u0648\u0638 \u06c1\u0648\u0646\u06d2 \u06a9\u06d2 \u0628\u0639\u062f\u060c \u0622\u067e \u06a9\u06d2 \u067e\u0627\u0633 \u062a\u0644\u0627\u0634 \u06a9\u0631\u0646\u06d2 \u06a9\u06cc \u0645\u06cc\u0632 \u06c1\u0648\u06af\u06cc\u06d4 \u0622\u067e \u0627\u0633\u06d2 \u0627\u067e\u0646\u06d2 \u0688\u06cc\u0679\u0627 \u0627\u06a9\u0679\u06be\u0627 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u062f\u0631\u062c \u0630\u06cc\u0644 \u0627\u06c1\u0645 \u0627\u0642\u062f\u0627\u0645\u0627\u062a \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 
<strong>\u067e\u06cc\u0634\u06af\u06cc \u062a\u0631\u0628\u06cc\u062a<\/strong>.<\/p>\n<h2 id=\"heading-3-pre-training\">3. \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af<\/h2>\n<p>\u0627\u0633 \u062d\u0635\u06d2 \u0645\u06cc\u06ba\u060c \u0645\u0627\u0688\u0644 \u0632\u0628\u0627\u0646\u060c \u06af\u0631\u0627\u0645\u0631\u060c \u067e\u06cc\u0679\u0631\u0646\u060c \u0627\u0648\u0631 \u0627\u0644\u0641\u0627\u0638 \u0633\u06cc\u06a9\u06be\u062a\u0627 \u06c1\u06d2\u06d4 \u062a\u0631\u0628\u06cc\u062a \u0645\u06a9\u0645\u0644 \u06c1\u0648\u0646\u06d2 \u06a9\u06d2 \u0628\u0639\u062f\u060c \u0645\u0627\u0688\u0644 \u062a\u0631\u062a\u06cc\u0628 \u0645\u06cc\u06ba \u0627\u06af\u0644\u06d2 \u0644\u0641\u0638 \u06a9\u06cc \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648\u0626\u06cc \u06a9\u0631 \u0633\u06a9\u062a\u0627 \u06c1\u06d2\u060c \u0627\u0648\u0631 \u06cc\u06c1\u06cc\u06ba \u0633\u06d2 \u06c1\u0645 \u062e\u0627\u0645 \u0688\u06cc\u0679\u0627 \u06a9\u0648 LLM \u0645\u06cc\u06ba \u062a\u0628\u062f\u06cc\u0644 \u06c1\u0648\u062a\u06d2 \u062f\u06cc\u06a9\u06be\u0646\u0627 \u0634\u0631\u0648\u0639 \u06a9\u0631 \u062f\u06cc\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p><strong>LLM \u062f\u0631\u0627\u0635\u0644 \u0627\u06af\u0644\u0627 \u0644\u0641\u0638 \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648 \u06c1\u06d2\u06d4<\/strong> \u0627\u0644\u0641\u0627\u0638 \u06a9\u06d2 \u0627\u06cc\u06a9 \u0633\u06cc\u0679 \u06a9\u0648 \u062f\u06cc\u06a9\u06be\u062a\u06d2 \u06c1\u0648\u0626\u06d2\u060c \u06cc\u06c1 \u0645\u0645\u06a9\u0646\u06c1 \u0637\u0648\u0631 \u067e\u0631 \u0627\u06af\u0644\u06d2 \u0644\u0641\u0638 \u06a9\u06cc \u067e\u06cc\u0634 \u06af\u0648\u0626\u06cc \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0630\u0631\u06cc\u0639\u06d2\u060c \u0645\u0627\u0688\u0644 \u0633\u06cc\u06a9\u06be\u062a\u0627 \u06c1\u06d2:<\/p>\n<ul>\n<li>\n<p>\u0632\u0628\u0627\u0646 \u06a9\u06cc 
\u062a\u0631\u06a9\u06cc\u0628<\/p>\n<\/li>\n<li>\n<p>\u0633\u06cc\u0645\u0646\u0679\u06a9\u0633\u060c \u0633\u06cc\u0627\u0642 \u0648 \u0633\u0628\u0627\u0642 \u06a9\u06d2 \u0645\u0639\u0646\u06cc<\/p>\n<\/li>\n<li>\n<p>\u06a9\u062b\u0631\u062a \u0633\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06c1\u0648\u0646\u06d2 \u0648\u0627\u0644\u06d2 \u062a\u0627\u062b\u0631\u0627\u062a<\/p>\n<\/li>\n<li>\n<p>\u062a\u0631\u0628\u06cc\u062a\u06cc \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679 \u0633\u06d2 \u062d\u0642\u0627\u0626\u0642<\/p>\n<\/li>\n<\/ul>\n<p>\u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0644\u06cc\u06d2 \u06a9\u0626\u06cc \u0627\u062e\u062a\u06cc\u0627\u0631\u0627\u062a \u06c1\u06cc\u06ba\u06d4 \u0686\u0648\u0646\u06a9\u06c1 \u0645\u0627\u0688\u0644 \u0686\u06be\u0648\u0679\u0627 \u06c1\u06d2\u060c \u0622\u067e \u0627\u0633\u06d2 \u0627\u067e\u0646\u06cc \u0645\u0642\u0627\u0645\u06cc \u0645\u0634\u06cc\u0646 \u067e\u0631 \u062a\u0631\u0628\u06cc\u062a \u062f\u06d2 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u06cc\u06c1 \u0633\u0633\u062a \u06c1\u06d2\u060c \u0644\u06cc\u06a9\u0646 \u06cc\u06c1 \u06a9\u0627\u0645 \u06c1\u0648 \u062c\u0627\u062a\u0627 \u06c1\u06d2.<\/p>\n<p>\u062f\u0648\u0633\u0631\u0627 \u0622\u067e\u0634\u0646 \u06af\u0648\u06af\u0644 \u06a9\u0648\u0644\u0627\u0628 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u0646\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0648\u06c1\u06cc \u06c1\u06d2 \u062c\u0648 \u0645\u06cc\u06ba \u0646\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06cc\u0627: \u0645\u0641\u062a \u0648\u0631\u0698\u0646 T4 GPU \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0645\u06cc\u0631\u06cc \u062a\u0631\u0628\u06cc\u062a \u06a9\u06cc \u0636\u0631\u0648\u0631\u06cc\u0627\u062a \u06a9\u06d2 \u0644\u0626\u06d2 \u06a9\u0627\u0641\u06cc \u062a\u06be\u0627\u06d4<\/p>\n<h3 id=\"heading-steps-to-do-pre-training\">\u067e\u0631\u06cc 
\u0679\u0631\u06cc\u0646\u0646\u06af \u0627\u0646\u062c\u0627\u0645 \u062f\u06cc\u0646\u06d2 \u06a9\u06d2 \u0627\u0642\u062f\u0627\u0645\u0627\u062a<\/h3>\n<ol>\n<li>\n<p>\u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679 JSONL \u0641\u0627\u0626\u0644 \u0627\u0648\u0631 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u0648 \u06af\u0648\u06af\u0644 \u0688\u0631\u0627\u0626\u06cc\u0648 \u067e\u0631 \u0627\u067e \u0644\u0648\u0688 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0645\u0627\u0688\u0644 \u06a9\u0646\u0641\u06cc\u06af\u0631\u06cc\u0634\u0646 \u0633\u06cc\u0679 \u06a9\u0631\u06cc\u06ba (\u0627\u0644\u0641\u0627\u0638 \u06a9\u0627 \u0633\u0627\u0626\u0632\u060c \u067e\u0631\u062a\u06cc\u06ba\u060c \u0633\u0631\u060c \u0648\u063a\u06cc\u0631\u06c1)\u06d4<\/p>\n<\/li>\n<li>\n<p>\u06a9\u0646\u0648\u0631\u0679\u0631 \u0641\u0646 \u062a\u0639\u0645\u06cc\u0631 \u06a9\u06cc \u0648\u0636\u0627\u062d\u062a \u06a9\u0631\u06cc\u06ba (\u062a\u0648\u062c\u06c1\u060c \u0641\u06cc\u0688 \u0641\u0627\u0631\u0648\u0631\u0688\u060c \u0628\u0644\u0627\u06a9)\u06d4<\/p>\n<\/li>\n<li>\n<p>\u06a9\u0627\u0631\u067e\u0633 \u06a9\u0648 \u062a\u0631\u0628\u06cc\u062a\/\u062a\u0648\u062b\u06cc\u0642 \u06a9\u06cc \u062a\u0642\u0633\u06cc\u0645 \u0645\u06cc\u06ba \u0644\u0648\u0688 \u06a9\u0631\u06cc\u06ba \u0627\u0648\u0631 \u0627\u0633\u06d2 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0622\u067e\u0679\u06cc\u0645\u0627\u0626\u0632\u0631\u060c LR \u0634\u06cc\u0688\u0648\u0644 \u0627\u0648\u0631 \u0686\u06cc\u06a9 \u067e\u0648\u0627\u0626\u0646\u0679\u0633 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0679\u0631\u06cc\u0646\u0646\u06af \u0644\u0648\u067e \u0686\u0644\u0627\u0626\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<\/ol>\n<h3 id=\"heading-model-configuration\">\u0645\u0627\u0688\u0644 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628<\/h3>\n<pre><code 
class=\"language-python\">from dataclasses import dataclass\n\n@dataclass\nclass UrduLLMConfig:\n    # Vocabulary\n    vocab_size: int = 32_000\n    pad_token_id: int = 0\n    bos_token_id: int = 2\n    eos_token_id: int = 3\n\n    # Model Architecture\n    d_model: int = 384\n    n_layers: int = 6\n    n_heads: int = 6\n    d_ff: int = 1536  # 4 * d_model\n    dropout: float = 0.1\n    max_seq_len: int = 256\n\n    # Training\n    batch_size: int = 32\n    learning_rate: float = 3e-4\n    weight_decay: float = 0.1\n    max_epochs: int = 10\n    warmup_steps: int = 500\n    grad_clip: float = 1.0\n<\/code><\/pre>\n<h4 id=\"heading-configuration-parameters-explained\">\u06a9\u0646\u0641\u06cc\u06af\u0631\u06cc\u0634\u0646 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 \u06a9\u06cc \u062a\u0641\u0635\u06cc\u0644:<\/h4>\n<p>\u0644\u063a\u0648\u06cc \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 (<code>vocab_size<\/code>, <code>pad_token_id<\/code>, <code>bos_token_id<\/code>, <code>eos_token_id<\/code>) \u0628\u0633 \u0627\u0633\u06d2 \u0627\u0633 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u0633\u06d2 \u062c\u0648\u0691\u06cc\u06ba \u062c\u0648 \u0622\u067e \u0646\u06d2 \u067e\u06c1\u0644\u06d2 \u0628\u0646\u0627\u06cc\u0627 \u062a\u06be\u0627\u06d4 <code>vocab_size<\/code> 32K (BPE \u0630\u062e\u06cc\u0631\u06c1 \u0627\u0644\u0641\u0627\u0638) \u06c1\u06d2 \u0627\u0648\u0631 \u062e\u0635\u0648\u0635\u06cc \u0679\u0648\u06a9\u0646 IDs (0, 2, 3) \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u06a9\u06cc \u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646 \u062a\u0641\u0648\u06cc\u0636 \u06a9\u0631\u062f\u06c1 \u0639\u06c1\u062f\u0648\u06ba \u06a9\u06d2 \u0645\u0637\u0627\u0628\u0642 \u06c1\u06cc\u06ba\u06d4<\/p>\n<h4 id=\"heading-model-architecture-parameters\">\u0645\u0627\u0688\u0644 \u0641\u0646 \u062a\u0639\u0645\u06cc\u0631 \u06a9\u06d2 
\u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632:<\/h4>\n<table>\n<thead>\n<tr>\n<th>\u0645\u062a\u063a\u06cc\u0631<\/th>\n<th>\u0627\u0633 \u06a9\u0627 \u06a9\u06cc\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2<\/th>\n<th>\u06c1\u0627\u06ba<\/th>\n<th>\u0627\u0642\u062f\u0627\u0631 \u06a9\u0627 \u0627\u062b\u0631<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td><code>d_model<\/code><\/td>\n<td>\u0627\u06cc\u0645\u0628\u06cc\u0688\u0646\u06af\/\u0648\u06cc\u06a9\u0679\u0631 \u0633\u0627\u0626\u0632 \u0641\u06cc \u0679\u0648\u06a9\u0646<\/td>\n<td>384<\/td>\n<td>\u0627\u0639\u0644\u06cc: \u0628\u06c1\u062a\u0631 \u0641\u06c1\u0645\u060c \u0644\u06cc\u06a9\u0646 \u0633\u0633\u062a \u0631\u0641\u062a\u0627\u0631 \u0627\u0648\u0631 \u0632\u06cc\u0627\u062f\u06c1 \u0645\u06cc\u0645\u0648\u0631\u06cc\u06d4 \u0644\u0648: \u062a\u06cc\u0632\u060c \u0644\u06cc\u06a9\u0646 \u06a9\u0645 \u0627\u0638\u06c1\u0627\u0631 \u062e\u06cc\u0627\u0644\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>n_layers<\/code><\/td>\n<td>\u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u06a9\u06cc \u062a\u06c1\u0648\u06ba \u06a9\u06cc \u062a\u0639\u062f\u0627\u062f<\/td>\n<td>6<\/td>\n<td>\u0645\u0632\u06cc\u062f \u067e\u0631\u062a\u06cc\u06ba \u0628\u06c1\u062a\u0631 \u062a\u0641\u06c1\u06cc\u0645 \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u062a\u06cc \u06c1\u06cc\u06ba\u060c \u0644\u06cc\u06a9\u0646 \u0637\u0648\u06cc\u0644 \u062a\u0627\u062e\u06cc\u0631\u06d4 \u06a9\u0645: \u062a\u06cc\u0632 \u0644\u06cc\u06a9\u0646 \u06a9\u0645 \u0637\u0627\u0642\u062a\u0648\u0631<\/td>\n<\/tr>\n<tr>\n<td><code>n_heads<\/code><\/td>\n<td>\u062f\u06be\u06cc\u0627\u0646 \u0633\u0631 \u0641\u06cc \u067e\u0631\u062a<\/td>\n<td>6<\/td>\n<td>\u0645\u0632\u06cc\u062f \u0633\u0631: \u0628\u06c1\u062a\u0631 \u0633\u06cc\u0627\u0642 \u0648 \u0633\u0628\u0627\u0642 \u06a9\u06cc \u06af\u0631\u0641\u062a\u06d4 \u0628\u06c1\u062a \u06a9\u0645: \u0631\u06cc\u0627\u0633\u062a\u06cc \u062a\u0646\u0648\u0639 
\u0645\u062d\u062f\u0648\u062f \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>d_ff<\/code><\/td>\n<td>\u0641\u06cc\u0688 \u0641\u0627\u0631\u0648\u0631\u0688 \u067e\u0631\u062a \u06a9\u0627 \u0633\u0627\u0626\u0632<\/td>\n<td>1536<\/td>\n<td>\u0628\u0691\u0627: \u0632\u06cc\u0627\u062f\u06c1 \u06a9\u0645\u067e\u06cc\u0648\u0679\u06cc\u0634\u0646\u0644 \u067e\u0627\u0648\u0631\u06d4 \u0686\u06be\u0648\u0679\u0627: \u062a\u06cc\u0632 \u0644\u06cc\u06a9\u0646 \u06a9\u0645\u0632\u0648\u0631 \u062a\u0628\u062f\u06cc\u0644\u06cc<\/td>\n<\/tr>\n<tr>\n<td><code>dropout<\/code><\/td>\n<td>\u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646 % \u0646\u06cc\u0648\u0631\u0627\u0646 \u062d\u0630\u0641 \u06c1\u0648 \u06af\u0626\u06d2\u06d4<\/td>\n<td>0.1<\/td>\n<td>\u0632\u06cc\u0627\u062f\u06c1: \u0632\u06cc\u0627\u062f\u06c1 \u0641\u0679\u0646\u06af \u06a9\u0648 \u0631\u0648\u06a9\u062a\u0627 \u06c1\u06d2\u060c \u0644\u06cc\u06a9\u0646 \u0627\u0633 \u06a9\u06d2 \u0646\u062a\u06cc\u062c\u06d2 \u0645\u06cc\u06ba \u06a9\u0645 \u0641\u0679\u0646\u06af \u06c1\u0648 \u0633\u06a9\u062a\u06cc \u06c1\u06d2\u06d4 \u06a9\u0645: \u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0686\u06be\u0627\u060c \u0644\u06cc\u06a9\u0646 \u0632\u06cc\u0627\u062f\u06c1 \u0641\u0679\u0646\u06af \u06a9\u0627 \u062e\u0637\u0631\u06c1<\/td>\n<\/tr>\n<tr>\n<td><code>max_seq_len<\/code><\/td>\n<td>\u0632\u06cc\u0627\u062f\u06c1 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0679\u0648\u06a9\u0646 \u0641\u06cc \u0627\u0646 \u067e\u0679<\/td>\n<td>256<\/td>\n<td>\u0627\u0639\u0644\u06cc: \u0632\u06cc\u0627\u062f\u06c1 \u0633\u06cc\u0627\u0642 \u0648 \u0633\u0628\u0627\u0642\u060c \u0644\u06cc\u06a9\u0646 \u0633\u0633\u062a \u0627\u0648\u0631 \u0632\u06cc\u0627\u062f\u06c1 \u0645\u06c1\u0646\u06af\u0627\u06d4 \u06a9\u0645: \u062a\u06cc\u0632\u060c \u0644\u06cc\u06a9\u0646 \u0633\u06cc\u0627\u0642 \u0648 \u0633\u0628\u0627\u0642 
\u0645\u062d\u062f\u0648\u062f<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h4 id=\"heading-training-hyperparameters\">\u0679\u0631\u06cc\u0646\u0646\u06af \u06c1\u0627\u0626\u067e\u0631\u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631:<\/h4>\n<table>\n<thead>\n<tr>\n<th>\u0645\u062a\u063a\u06cc\u0631<\/th>\n<th>\u0627\u0633 \u06a9\u0627 \u06a9\u06cc\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2<\/th>\n<th>\u06c1\u0627\u06ba<\/th>\n<th>\u0627\u0642\u062f\u0627\u0631 \u06a9\u0627 \u0627\u062b\u0631<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td><code>batch_size<\/code><\/td>\n<td>\u0646\u0645\u0648\u0646\u06d2 \u0641\u06cc \u062a\u0631\u0628\u06cc\u062a\u06cc \u0642\u062f\u0645<\/td>\n<td>32<\/td>\n<td>\u0628\u0691\u0627: \u062a\u0631\u0628\u06cc\u062a \u062a\u06cc\u0632 \u06c1\u0648\u062a\u06cc \u06c1\u06d2 \u0644\u06cc\u06a9\u0646 \u0627\u0633 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0645\u06cc\u0645\u0648\u0631\u06cc \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u0648\u062a\u06cc \u06c1\u06d2\u06d4 \u0686\u06be\u0648\u0679\u0627: \u0645\u0633\u062a\u062d\u06a9\u0645 \u0644\u06cc\u06a9\u0646 \u0633\u0633\u062a<\/td>\n<\/tr>\n<tr>\n<td><code>learning_rate<\/code><\/td>\n<td>\u0642\u062f\u0645 \u06a9\u06d2 \u0633\u0627\u0626\u0632 \u06a9\u0648 \u0627\u067e \u0688\u06cc\u0679 \u06a9\u0631\u06cc\u06ba\u06d4<\/td>\n<td>0.0003<\/td>\n<td>\u0627\u06af\u0631 \u06cc\u06c1 \u0628\u06c1\u062a \u0632\u06cc\u0627\u062f\u06c1 \u06c1\u06d2\u060c \u062a\u0648 \u0622\u067e \u06a9\u06cc \u062a\u0631\u0628\u06cc\u062a \u063a\u06cc\u0631 \u0645\u0633\u062a\u062d\u06a9\u0645 \u06c1\u0648\u06af\u06cc\u06d4 \u0628\u06c1\u062a \u06a9\u0645: \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06cc \u0634\u0631\u062d \u0628\u06c1\u062a \u0633\u0633\u062a \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>weight_decay<\/code><\/td>\n<td>\u0645\u0639\u0645\u0648\u0644 \u06a9\u06cc \u0637\u0627\u0642\u062a<\/td>\n<td>0.1<\/td>\n<td>\u0627\u0639\u0644\u06cc: \u0627\u0648\u0648\u0631 
\u0641\u0679\u0646\u06af \u06a9\u0648 \u06a9\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u06a9\u0645: \u0627\u0648\u0648\u0631 \u0641\u0679\u0646\u06af \u06a9\u0627 \u062e\u0637\u0631\u06c1<\/td>\n<\/tr>\n<tr>\n<td><code>max_epochs<\/code><\/td>\n<td>\u0645\u06a9\u0645\u0644 \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679 \u067e\u0627\u0633<\/td>\n<td>10<\/td>\n<td>\u0645\u0632\u06cc\u062f: \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06cc \u0635\u0644\u0627\u062d\u06cc\u062a \u06a9\u0648 \u0628\u06c1\u062a\u0631 \u0628\u0646\u0627\u062a\u0627 \u06c1\u06d2\u060c \u0644\u06cc\u06a9\u0646 \u0632\u06cc\u0627\u062f\u06c1 \u0641\u0679 \u06c1\u0648\u0646\u06d2 \u06a9\u0627 \u062e\u0637\u0631\u06c1 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4 \u0686\u0646\u062f: \u0632\u06cc\u0631 \u062a\u0631\u0628\u06cc\u062a \u0645\u0627\u0688\u0644<\/td>\n<\/tr>\n<tr>\n<td><code>warmup_steps<\/code><\/td>\n<td>\u0628\u062a\u062f\u0631\u06cc\u062c LR \u0645\u06cc\u06ba \u0627\u0636\u0627\u0641\u06c1 \u06a9\u0627 \u0645\u0631\u062d\u0644\u06c1<\/td>\n<td>500<\/td>\n<td>\u0645\u0632\u06cc\u062f: \u0646\u0631\u0645 \u0634\u0631\u0648\u0639\u0627\u062a\u060c \u0645\u062d\u0641\u0648\u0638 \u062a\u0631\u0628\u06cc\u062a\u06d4 \u06a9\u0645: \u0627\u0628\u062a\u062f\u0627\u0626\u06cc \u0639\u062f\u0645 \u0627\u0633\u062a\u062d\u06a9\u0627\u0645 \u06a9\u0627 \u062e\u0637\u0631\u06c1<\/td>\n<\/tr>\n<tr>\n<td><code>grad_clip<\/code><\/td>\n<td>\u0632\u06cc\u0627\u062f\u06c1 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0645\u06cc\u0644\u0627\u0646 \u0642\u062f\u0631<\/td>\n<td>1.0<\/td>\n<td>\u06a9\u0645: \u0645\u0633\u062a\u062d\u06a9\u0645\u060c \u0644\u06cc\u06a9\u0646 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u0645\u06cc\u06ba \u0633\u0633\u062a\u06d4 \u0632\u06cc\u0627\u062f\u06c1: \u0688\u06be\u0644\u0648\u0627\u0646 \u062f\u06be\u0645\u0627\u06a9\u06d2 \u06a9\u0627 \u062e\u0637\u0631\u06c1<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3 
id=\"heading-transformer-architecture\">\u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u0641\u0646 \u062a\u0639\u0645\u06cc\u0631<\/h3>\n<p>\u06cc\u06c1\u0627\u06ba \u062a\u0631\u0628\u06cc\u062a \u06a9\u0627 \u0627\u06c1\u0645 \u062a\u0631\u06cc\u0646 \u062d\u0635\u06c1 \u06c1\u06d2: <strong>\u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u0641\u0646 \u062a\u0639\u0645\u06cc\u0631<\/strong>\u06d4 \u06a9\u0648\u0688 \u0634\u0631\u0648\u0639 \u06a9\u0631\u0646\u06d2 \u0633\u06d2 \u067e\u06c1\u0644\u06d2\u060c \u06cc\u06c1 \u062c\u0627\u0646\u0646\u0627 \u0636\u0631\u0648\u0631\u06cc \u06c1\u06d2 \u06a9\u06c1 \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u06a9\u0627 \u0641\u0646 \u062a\u0639\u0645\u06cc\u0631 \u06a9\u06cc\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u0627\u0633 \u0628\u0627\u0631\u06d2 \u0645\u06cc\u06ba \u0645\u0632\u06cc\u062f \u062c\u0627\u0646\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06a9\u06c1 \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u06a9\u06cc\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u06cc\u06c1 RNNs \u0627\u0648\u0631 CNNs \u0633\u06d2 \u06a9\u06cc\u0633\u06d2 \u0645\u062e\u062a\u0644\u0641 \u06c1\u06d2\u060c \u06c1\u0645 \u0627\u0633 \u0645\u0636\u0645\u0648\u0646 \u06a9\u0648 \u067e\u0691\u06be\u0646\u06d2 \u06a9\u06cc \u062a\u062c\u0648\u06cc\u0632 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba: AWS: \u0645\u0635\u0646\u0648\u0639\u06cc \u0630\u06c1\u0627\u0646\u062a \u0645\u06cc\u06ba \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631\u0632 \u06a9\u06cc\u0627 \u06c1\u06cc\u06ba\u061f<\/p>\n<p>\u0644\u06cc\u06a9\u0646 \u0645\u062e\u062a\u0635\u0631 \u0645\u06cc\u06ba:<\/p>\n<blockquote>\n<p><em>&quot;\u0627\u06cc\u06a9 \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u0646\u06cc\u0648\u0631\u0644 \u0646\u06cc\u0679 \u0648\u0631\u06a9 \u0641\u0646 \u062a\u0639\u0645\u06cc\u0631 \u06a9\u06cc \u0627\u06cc\u06a9 \u0642\u0633\u0645 \u06c1\u06d2 \u062c\u0648 \u06a9\u0633\u06cc \u0627\u0646 \u067e\u0679 \u06a9\u06cc 
\u062a\u0631\u062a\u06cc\u0628 \u06a9\u0648 \u0622\u0624\u0679 \u067e\u0679 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628 \u0645\u06cc\u06ba \u062a\u0628\u062f\u06cc\u0644 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4&quot;<\/em><\/p>\n<\/blockquote>\n<p>\u0627\u0635\u0644 \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u0645\u0642\u0627\u0644\u06d2 \u0645\u06cc\u06ba \u062f\u0648\u0646\u0648\u06ba \u0634\u0627\u0645\u0644 \u062a\u06be\u06d2: <strong>\u0627\u0646\u06a9\u0648\u0688\u0631<\/strong> (\u0627\u0646 \u067e\u0679 \u067e\u0691\u06be\u062a\u0627 \u06c1\u06d2) \u0627\u0648\u0631 <strong>\u0688\u06cc\u06a9\u0648\u0688\u0631<\/strong> (\u0622\u0624\u0679 \u067e\u0679 \u062a\u06cc\u0627\u0631 \u06a9\u0631\u062a\u0627 \u06c1\u06d2)\u06d4 \u062a\u0627\u06c1\u0645\u060c \u06c1\u0645\u0627\u0631\u06d2 \u062c\u06cc\u0633\u06d2 \u062c\u06cc \u067e\u06cc \u0679\u06cc \u0637\u0631\u0632 \u06a9\u06d2 \u0645\u0627\u0688\u0644 \u0635\u0631\u0641 \u0688\u06cc\u06a9\u0648\u0688\u0631 \u06a9\u0627 \u062d\u0635\u06c1 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u06cc\u06c1 <strong>\u0635\u0631\u0641 \u0688\u06cc\u06a9\u0648\u0688\u0631<\/strong> \u0641\u0646 \u062a\u0639\u0645\u06cc\u0631 \u06c1\u06d2\u06d4<\/p>\n<p>\u0688\u06cc\u06a9\u0648\u0688\u0631 \u0679\u0648\u06a9\u0646\u0632 \u06a9\u06cc \u0627\u06cc\u06a9 \u0633\u06cc\u0631\u06cc\u0632 \u0644\u06cc\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u0627\u0646 \u067e\u0631 <strong>\u062e\u0648\u062f \u062f\u06be\u06cc\u0627\u0646 (self-attention)<\/strong> \u06a9\u0627 \u0627\u0637\u0644\u0627\u0642 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u060c \u062a\u0627\u06a9\u06c1 \u0627\u0646 \u06a9\u06d2 \u062f\u0631\u0645\u06cc\u0627\u0646 \u062a\u0639\u0644\u0642\u0627\u062a \u06a9\u0648 \u0633\u0645\u062c\u06be \u06a9\u0631 \u0627\u06af\u0644\u06d2 \u0679\u0648\u06a9\u0646 \u06a9\u06cc \u067e\u06cc\u0634\u0646 \u06af\u0648\u0626\u06cc 
\u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<p>\u062e\u0648\u062f \u062f\u06be\u06cc\u0627\u0646 \u06c1\u06cc \u0648\u06c1 \u06c1\u06d2 \u062c\u0648 \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u06a9\u0648 \u0637\u0627\u0642\u062a\u0648\u0631 \u0628\u0646\u0627\u062a\u0627 \u06c1\u06d2\u06d4 RNN \u06a9\u06cc \u0637\u0631\u062d \u0627\u06cc\u06a9 \u0627\u06cc\u06a9 \u06a9\u0631 \u06a9\u06d2 \u0679\u0648\u06a9\u0646 \u067e\u0631 \u06a9\u0627\u0631\u0631\u0648\u0627\u0626\u06cc \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0628\u062c\u0627\u0626\u06d2\u060c \u0645\u0627\u0688\u0644 \u062a\u0645\u0627\u0645 \u067e\u0686\u06be\u0644\u06d2 \u0679\u0648\u06a9\u0646\u0632 \u06a9\u0648 \u0628\u06cc\u06a9 \u0648\u0642\u062a \u062f\u06cc\u06a9\u06be\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u0627\u0633 \u0628\u0627\u062a \u06a9\u0627 \u062a\u0639\u06cc\u0646 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u06a9\u0648\u0646 \u0633\u0627 \u0679\u0648\u06a9\u0646 \u0645\u0648\u062c\u0648\u062f\u06c1 \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648\u0626\u06cc \u06a9\u06d2 \u0644\u06cc\u06d2 \u0633\u0628 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0645\u062a\u0639\u0644\u0642\u06c1 \u06c1\u06d2\u06d4<\/p>\n<p>\u06cc\u06c1\u0627\u06ba \u0645\u06a9\u0645\u0644 \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u06a9\u0648\u0688 \u06c1\u06d2\u06d4 \u06c1\u0631 \u062c\u0632\u0648 \u06a9\u0627 \u062a\u0641\u0635\u06cc\u0644\u06cc \u062a\u062c\u0632\u06cc\u06c1 \u062f\u0631\u062c \u0630\u06cc\u0644 \u06c1\u06d2:<\/p>\n<pre><code class=\"language-python\">import math\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\n\nclass MultiHeadSelfAttention(nn.Module):\n    def __init__(self, config):\n        super().__init__()\n        self.n_heads = config.n_heads\n        self.d_model = config.d_model\n        self.head_dim = config.d_model \/\/ config.n_heads\n\n        self.qkv_proj = nn.Linear(config.d_model, 3 * config.d_model)\n        self.out_proj = nn.Linear(config.d_model, config.d_model)\n        self.dropout = 
nn.Dropout(config.dropout)\n\n    def forward(self, x, mask=None):\n        B, T, C = x.shape\n\n        qkv = self.qkv_proj(x)\n        qkv = qkv.reshape(B, T, 3, self.n_heads, self.head_dim)\n        qkv = qkv.permute(2, 0, 3, 1, 4)\n        q, k, v = qkv[0], qkv[1], qkv[2]\n\n        attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)\n\n        if mask is not None:\n            attn = attn.masked_fill(mask == 0, float('-inf'))\n\n        attn = F.softmax(attn, dim=-1)\n        attn = self.dropout(attn)\n\n        out = attn @ v\n        out = out.transpose(1, 2).reshape(B, T, C)\n        out = self.out_proj(out)\n        return out\n\n\nclass FeedForward(nn.Module):\n    def __init__(self, config):\n        super().__init__()\n        self.fc1 = nn.Linear(config.d_model, config.d_ff)\n        self.fc2 = nn.Linear(config.d_ff, config.d_model)\n        self.dropout = nn.Dropout(config.dropout)\n\n    def forward(self, x):\n        x = F.gelu(self.fc1(x))\n        x = self.dropout(x)\n        x = self.fc2(x)\n        return x\n\n\nclass TransformerBlock(nn.Module):\n    def __init__(self, config):\n        super().__init__()\n        self.ln1 = nn.LayerNorm(config.d_model)\n        self.attn = MultiHeadSelfAttention(config)\n        self.ln2 = nn.LayerNorm(config.d_model)\n        self.ff = FeedForward(config)\n        self.dropout = nn.Dropout(config.dropout)\n\n    def forward(self, x, mask=None):\n        x = x + self.dropout(self.attn(self.ln1(x), mask))\n        x = x + self.dropout(self.ff(self.ln2(x)))\n        return x\n\n\nclass UrduGPT(nn.Module):\n    def __init__(self, config):\n        super().__init__()\n        self.config = config\n\n        self.token_emb = nn.Embedding(config.vocab_size, config.d_model)\n        self.pos_emb = nn.Embedding(config.max_seq_len, config.d_model)\n        self.dropout = nn.Dropout(config.dropout)\n\n        self.blocks = nn.ModuleList([\n            TransformerBlock(config) for _ in range(config.n_layers)\n     
   ])\n\n        self.ln_f = nn.LayerNorm(config.d_model)\n        self.head = nn.Linear(config.d_model, config.vocab_size, bias=False)\n\n        # Weight tying\n        self.head.weight = self.token_emb.weight\n\n        self.apply(self._init_weights)\n\n    def _init_weights(self, module):\n        if isinstance(module, nn.Linear):\n            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n            if module.bias is not None:\n                torch.nn.init.zeros_(module.bias)\n        elif isinstance(module, nn.Embedding):\n            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)\n\n    def forward(self, input_ids, targets=None):\n        B, T = input_ids.shape\n        device = input_ids.device\n\n        tok_emb = self.token_emb(input_ids)\n        pos = torch.arange(0, T, dtype=torch.long, device=device)\n        pos_emb = self.pos_emb(pos)\n\n        x = self.dropout(tok_emb + pos_emb)\n\n        # Causal mask\n        mask = torch.tril(torch.ones(T, T, device=device)).unsqueeze(0).unsqueeze(0)\n\n        for block in self.blocks:\n            x = block(x, mask)\n\n        x = self.ln_f(x)\n        logits = self.head(x)\n\n        loss = None\n        if targets is not None:\n            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n\n        return {'logits': logits, 'loss': loss}\n\n    @torch.no_grad()\n    def generate(self, input_ids, max_new_tokens=100, temperature=0.8,\n                 top_k=50, top_p=0.9, eos_token_id=None):\n        \"\"\"\n        Generate text autoregressively.\n\n        Sampling strategies:\n        - temperature: Controls randomness (low = deterministic, high = creative)\n        - top_k: Only consider the top K most likely tokens\n        - top_p (nucleus): Only consider tokens whose cumulative probability <= p\n        - eos_token_id: Stop generating when this token is produced\n        \"\"\"\n        self.eval()\n        eos_token_id = eos_token_id or 
getattr(self.config, 'eos_token_id', None)\n\n        for _ in range(max_new_tokens):\n            idx_cond = input_ids if input_ids.size(1) <= self.config.max_seq_len \\\n                       else input_ids[:, -self.config.max_seq_len:]\n\n            outputs = self.forward(idx_cond)\n            logits = outputs[\"logits\"][:, -1, :] \/ temperature\n\n            # Top-K filtering\n            if top_k > 0:\n                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))\n                logits[logits < v[:, [-1]]] = float('-inf')\n\n            # Top-P (nucleus) filtering\n            if top_p < 1.0:\n                sorted_logits, sorted_indices = torch.sort(logits, descending=True)\n                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)\n                sorted_indices_to_remove = cumulative_probs > top_p\n                sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()\n                sorted_indices_to_remove[:, 0] = 0\n                indices_to_remove = sorted_indices_to_remove.scatter(\n                    1, sorted_indices, sorted_indices_to_remove\n                )\n                logits[indices_to_remove] = float('-inf')\n\n            probs = F.softmax(logits, dim=-1)\n            next_token = torch.multinomial(probs, num_samples=1)\n            input_ids = torch.cat([input_ids, next_token], dim=1)\n\n            if eos_token_id is not None and next_token.item() == eos_token_id:\n                break\n\n        return input_ids\n<\/code><\/pre>\n<p>\u06cc\u06c1 \u06a9\u0648\u0688 \u0679\u06cc\u06a9\u0633\u0679 \u067e\u06cc\u0634\u0646 \u06af\u0648\u0626\u06cc \u0645\u0634\u06cc\u0646 \u0628\u0646\u0627\u062a\u0627 \u06c1\u06d2\u06d4 \u0627\u0633\u06d2 \u0627\u0631\u062f\u0648 \u06a9\u06d2 \u0686\u0646\u062f \u0627\u0644\u0641\u0627\u0638 \u062f\u06cc\u06ba \u0627\u0648\u0631 \u06cc\u06c1 \u0627\u06af\u0644\u06d2 \u0644\u0641\u0638 \u06a9\u0627 \u0627\u0646\u062f\u0627\u0632\u06c1 
\u0644\u06af\u0627\u062a\u0627 \u0631\u06c1\u06d2 \u06af\u0627 \u062c\u0628 \u062a\u06a9 \u06a9\u06c1 \u06cc\u06c1 \u062c\u0645\u0644\u06c1 \u0646\u06c1 \u0628\u0646 \u062c\u0627\u0626\u06d2\u06d4 ChatGPT \u0628\u06be\u06cc \u0628\u0627\u0644\u06a9\u0644 \u0627\u0633\u06cc \u0637\u0631\u062d \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u060c \u0628\u0633 \u0648\u06c1 \u0628\u06c1\u062a \u0628\u0691\u0627 \u06c1\u06d2\u06d4<\/p>\n<h3 id=\"heading-transformer-code-breakdown\">\u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u06a9\u0648\u0688 \u06a9\u0627 \u062a\u062c\u0632\u06cc\u06c1<\/h3>\n<h4 id=\"heading-1-multiheadselfattention-the-lookback-system\">1. MultiHeadSelfAttention: &quot;\u0644\u064f\u06a9 \u0628\u06cc\u06a9 \u0633\u0633\u0679\u0645&quot;<\/h4>\n<p>\u0627\u06cc\u06a9 \u062c\u0645\u0644\u06c1 \u067e\u0691\u06be\u0646\u06d2 \u06a9\u0627 \u062a\u0635\u0648\u0631 \u06a9\u0631\u06cc\u06ba\u06d4 \u062c\u0628 \u0622\u067e \u0644\u0641\u0638 &quot;\u0627\u0633&quot; (\u06cc\u06c1) \u062f\u06cc\u06a9\u06be\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u062a\u0648 \u0622\u067e \u06a9\u0627 \u062f\u0645\u0627\u063a \u06cc\u06c1 \u062c\u0627\u0646\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u067e\u06cc\u0686\u06be\u06d2 \u0645\u0691 \u06a9\u0631 \u062f\u06cc\u06a9\u06be\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 &quot;\u0627\u0633&quot; \u0633\u06d2 \u06a9\u06cc\u0627 \u0645\u0631\u0627\u062f \u06c1\u06d2\u06d4 \u06cc\u06c1\u06cc \u062a\u0648\u062c\u06c1 (attention) \u06c1\u06d2\u06d4<\/p>\n<p><strong>Q\u060c K\u060c V<\/strong>: \u0627\u0633\u06d2 \u0627\u06cc\u06a9 \u0644\u0627\u0626\u0628\u0631\u06cc\u0631\u06cc \u06a9\u06cc \u0637\u0631\u062d \u0633\u0648\u0686\u06cc\u06ba\u06d4<\/p>\n<ul>\n<li>\n<p><strong>\u0633\u0648\u0627\u0644 (Q):<\/strong> &quot;\u0645\u06cc\u06ba X \u06a9\u06d2 \u0628\u0627\u0631\u06d2 \u0645\u06cc\u06ba \u0645\u0639\u0644\u0648\u0645\u0627\u062a 
\u062a\u0644\u0627\u0634 \u06a9\u0631 \u0631\u06c1\u0627 \u06c1\u0648\u06ba&#8221;<\/p>\n<\/li>\n<li>\n<p><strong>\u06a9\u0644\u06cc\u062f (K):<\/strong> \u06c1\u0631 \u067e\u0686\u06be\u0644\u06d2 \u0644\u0641\u0638 \u06a9\u0648 &quot;Y \u06a9\u06d2 \u0628\u0627\u0631\u06d2 \u0645\u06cc\u06ba \u0645\u0639\u0644\u0648\u0645\u0627\u062a \u06c1\u06d2&#8221; \u06a9\u06cc \u0639\u0644\u0627\u0645\u062a \u0633\u06d2 \u0646\u0634\u0627\u0646 \u0632\u062f \u06a9\u06cc\u0627 \u06af\u06cc\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0642\u062f\u0631 (V):<\/strong> \u0627\u0635\u0644 \u0645\u0639\u0644\u0648\u0645\u0627\u062a \u062c\u0648 \u0627\u0644\u0641\u0627\u0638 \u0628\u06cc\u0627\u0646 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<\/ul>\n<p><strong>6 \u0633\u0631<\/strong> = \u0686\u06be \u0645\u062e\u062a\u0644\u0641 &quot;\u0642\u0627\u0631\u0626\u06cc\u0646&#8221; \u0627\u06cc\u06a9 \u06c1\u06cc \u0648\u0642\u062a \u0645\u06cc\u06ba \u062c\u0645\u0644\u06d2 \u06a9\u0648 \u062f\u06cc\u06a9\u06be \u0631\u06c1\u06d2 \u06c1\u06cc\u06ba\u06d4 \u06a9\u0686\u06be \u0644\u0648\u06af \u06af\u0631\u0627\u0645\u0631 \u067e\u0631 \u062a\u0648\u062c\u06c1 \u0645\u0631\u06a9\u0648\u0632 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u06a9\u0686\u06be \u0644\u0648\u06af \u0645\u0639\u0646\u06cc \u067e\u0631\u060c \u062f\u0648\u0633\u0631\u06d2 \u0642\u0631\u06cc\u0628\u06cc \u0627\u0644\u0641\u0627\u0638 \u0648\u063a\u06cc\u0631\u06c1 \u067e\u0631\u06d4<\/p>\n<p><strong>causal \u0645\u0627\u0633\u06a9<\/strong> = \u06cc\u06c1 \u0642\u0627\u0639\u062f\u06c1 \u06a9\u06c1 &quot;\u0622\u067e \u0635\u0631\u0641 \u0648\u06c1\u06cc \u0627\u0644\u0641\u0627\u0638 \u062f\u06cc\u06a9\u06be \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba \u062c\u0648 \u067e\u06c1\u0644\u06d2 \u0622\u062a\u06d2 \u06c1\u06cc\u06ba \u0646\u06c1 \u06a9\u06c1 \u0628\u0639\u062f \u0645\u06cc\u06ba \u0622\u0646\u06d2 \u0648\u0627\u0644\u06d2 
\u0627\u0644\u0641\u0627\u0638 \u06a9\u0648\u06d4&#8221; (\u06a9\u06cc\u0648\u0646\u06a9\u06c1 \u062a\u062e\u0644\u06cc\u0642 \u06a9\u06d2 \u0648\u0642\u062a \u0645\u0633\u062a\u0642\u0628\u0644 \u06a9\u0627 \u0644\u0641\u0638 \u0627\u0628\u06be\u06cc \u0645\u0648\u062c\u0648\u062f \u0646\u06c1\u06cc\u06ba \u06c1\u06d2!)<\/p>\n<p><strong>\u0631\u06cc\u0627\u0636\u06cc:<\/strong> &quot;\u06c1\u0631 \u0644\u0641\u0638 \u06a9\u062a\u0646\u0627 \u0645\u062a\u0639\u0644\u0642\u06c1 \u06c1\u06d2\u061f&#8221; \u062d\u0627\u0635\u0644 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 Q\u00d7K \u06a9\u0648 \u0636\u0631\u0628 \u062f\u06cc\u06ba\u06d4 \u0627\u0648\u0631 \u067e\u06be\u0631 \u0627\u0633 \u0633\u06a9\u0648\u0631 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 V \u0633\u06d2 \u0633\u0628 \u0633\u06d2 \u0645\u0641\u06cc\u062f \u0645\u0639\u0644\u0648\u0645\u0627\u062a \u062d\u0627\u0635\u0644 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<h4 id=\"heading-2-feedforward-the-thinking-step\">2. 
\u0641\u06cc\u0688 \u0641\u0627\u0631\u0648\u0631\u0688: &quot;\u0633\u0648\u0686 \u06a9\u06d2 \u0627\u0642\u062f\u0627\u0645\u0627\u062a&#8221;<\/h4>\n<p>\u06cc\u06c1 \u0648\u06c1 \u0645\u0631\u062d\u0644\u06c1 \u06c1\u06d2 \u062c\u06c1\u0627\u06ba \u0622\u067e \u06cc\u06c1 \u062c\u0627\u0646\u0646\u06d2 \u067e\u0631 \u062a\u0648\u062c\u06c1 \u062f\u06cc\u062a\u06d2 \u06c1\u06cc\u06ba \u06a9\u06c1 \u06a9\u0648\u0646 \u0633\u06d2 \u0627\u0644\u0641\u0627\u0638 \u0627\u06c1\u0645 \u06c1\u06cc\u06ba\u060c \u0627\u0648\u0631 \u067e\u06be\u0631 \u0645\u0627\u0688\u0644 \u062f\u0631\u0627\u0635\u0644 \u0633\u0648\u0686\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u0627\u0646 \u06a9\u0627 \u06a9\u06cc\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2\u06d4<\/p>\n<p>\u06cc\u06c1 \u0635\u0631\u0641 \u062f\u0648 \u067e\u0631\u062a\u06cc\u06ba \u06c1\u06cc\u06ba:<\/p>\n<ul>\n<li>\n<p><strong>\u062a\u0648\u0633\u06cc\u0639 (384 \u2192 1536):<\/strong> \u0627\u067e\u0646\u06d2 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0633\u0648\u0686\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u0632\u06cc\u062f &quot;\u062f\u0645\u0627\u063a \u06a9\u06cc \u062c\u06af\u06c1&#8221; \u062f\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0633\u06a9\u0691\u06cc\u06ba (1536 \u2192 384):<\/strong> \u0627\u067e\u0646\u06d2 \u062e\u06cc\u0627\u0644\u0627\u062a \u06a9\u0648 \u062f\u0648\u0628\u0627\u0631\u06c1 \u06a9\u0645\u067e\u0631\u06cc\u0633 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>GELU \u06a9\u0648 \u0686\u0627\u0644\u0648 \u06a9\u0631\u06cc\u06ba:<\/strong> \u06cc\u06c1 \u0641\u06cc\u0635\u0644\u06c1 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0641\u0644\u0679\u0631 \u06a9\u0631\u06cc\u06ba (\u06c1\u0645\u0648\u0627\u0631\u060c \u0633\u062e\u062a \u0646\u06c1\u06cc\u06ba)<\/p>\n<\/li>\n<\/ul>\n<h4 id=\"heading-3-transformerblock-one-round-of-reading\">3. 
\u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u0628\u0644\u0627\u06a9: &quot;1 \u0628\u0627\u0631 \u067e\u0691\u06be\u06cc\u06ba&#8221;<\/h4>\n<p>\u06cc\u06c1 \u06a9\u0633\u06cc \u062c\u0645\u0644\u06d2 \u06a9\u0648 \u067e\u0691\u06be\u0646\u06d2 \u0627\u0648\u0631 \u0633\u0648\u0686\u0646\u06d2 \u06a9\u0627 \u0639\u0645\u0644 \u06c1\u06d2\u06d4<\/p>\n<ul>\n<li>\n<p><strong>\u0645\u0631\u062d\u0644\u06c1 1:<\/strong> \u062f\u0648\u0633\u0631\u06d2 \u0627\u0644\u0641\u0627\u0638 \u062f\u06cc\u06a9\u06be\u06cc\u06ba (\u0646\u0648\u0679)<\/p>\n<\/li>\n<li>\n<p><strong>\u0645\u0631\u062d\u0644\u06c1 2:<\/strong> \u0622\u067e \u062c\u0648 \u062f\u06cc\u06a9\u06be\u062a\u06d2 \u06c1\u06cc\u06ba \u0627\u0633 \u06a9\u06d2 \u0628\u0627\u0631\u06d2 \u0645\u06cc\u06ba \u0633\u0648\u0686\u06cc\u06ba (\u0641\u06cc\u0688 \u0641\u0627\u0631\u0648\u0631\u0688)<\/p>\n<\/li>\n<li>\n<p><strong>\u067e\u0631\u062a \u06a9\u0627 \u0645\u0639\u06cc\u0627\u0631:<\/strong> \u06cc\u06c1 \u0627\u067e\u0646\u06d2 \u062f\u0645\u0627\u063a \u06a9\u0648 \u0642\u062f\u0645\u0648\u06ba \u06a9\u06d2 \u062f\u0631\u0645\u06cc\u0627\u0646 \u0631\u06cc \u0633\u06cc\u0679 \u06a9\u0631\u0646\u06d2 \u062c\u06cc\u0633\u0627 \u06c1\u06d2 \u062a\u0627\u06a9\u06c1 \u0627\u0633 \u0628\u0627\u062a \u06a9\u0648 \u06cc\u0642\u06cc\u0646\u06cc \u0628\u0646\u0627\u06cc\u0627 \u062c\u0627 \u0633\u06a9\u06d2 \u06a9\u06c1 \u0646\u0645\u0628\u0631 \u0628\u06c1\u062a \u0632\u06cc\u0627\u062f\u06c1 \u06cc\u0627 \u0628\u06c1\u062a \u0686\u06be\u0648\u0679\u06d2 \u0646\u06c1 \u06c1\u0648\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0628\u0627\u0642\u06cc \u06a9\u0646\u06a9\u0634\u0646 (<\/strong><code>x + ...<\/code><strong>):<\/strong> \u0645\u0627\u0688\u0644 \u0627\u0635\u0644 \u062e\u06cc\u0627\u0644 \u06a9\u0648 \u0628\u0631\u0642\u0631\u0627\u0631 \u0631\u06a9\u06be\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u0646\u0626\u06cc \u0628\u0635\u06cc\u0631\u062a \u06a9\u0627 \u0627\u0636\u0627\u0641\u06c1 
\u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0646\u0648\u0679 \u0644\u06cc\u0646\u06d2 \u062c\u06cc\u0633\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u067e\u0631\u0627\u0646\u06d2 \u0646\u0648\u0679 \u06a9\u0648 \u062d\u0630\u0641 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0628\u062c\u0627\u0626\u06d2 \u0627\u06cc\u06a9 \u0646\u06cc\u0627 \u0646\u0648\u0679 \u0634\u0627\u0645\u0644 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<\/ul>\n<p>\u0645\u0627\u0688\u0644 \u06cc\u06c1 6 \u0628\u0627\u0631 (6 \u0628\u0644\u0627\u06a9\u0633) \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u06c1\u0631 \u062f\u0648\u0631 \u06a9\u06d2 \u0633\u0627\u062a\u06be\u060c \u0622\u067e \u0645\u062a\u0646 \u06a9\u0648 \u0642\u062f\u0631\u06d2 \u06af\u06c1\u0631\u0627\u0626\u06cc \u0633\u06d2 \u0633\u0645\u062c\u06be\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<h4 id=\"heading-4-urdugpt-the-full-machine\">4. \u0627\u0631\u062f\u0648 \u062c\u06cc \u067e\u06cc \u0679\u06cc: &quot;\u067e\u0631\u0641\u06cc\u06a9\u0679 \u0645\u0634\u06cc\u0646&#8221;<\/h4>\n<p><strong>\u062a\u0631\u062a\u06cc\u0628 (<\/strong><code>__init__<\/code><strong>):<\/strong><\/p>\n<ul>\n<li>\n<p><strong>\u0679\u0648\u06a9\u0646 \u0627\u06cc\u0645\u0628\u06cc\u0688\u0646\u06af:<\/strong> \u0628\u0691\u06cc \u062a\u0644\u0627\u0634 \u06a9\u06cc \u0645\u06cc\u0632\u06d4 \u0627\u0631\u062f\u0648 \u06a9\u06d2 32,000 \u0627\u0644\u0641\u0627\u0638\/ \u0630\u06cc\u0644\u06cc \u0627\u0644\u0641\u0627\u0638 \u0645\u06cc\u06ba \u0633\u06d2 \u06c1\u0631 \u0627\u06cc\u06a9 \u06a9\u0648 384 \u0646\u0645\u0628\u0631\u0648\u06ba \u06a9\u06cc \u0641\u06c1\u0631\u0633\u062a \u0645\u0644\u062a\u06cc \u06c1\u06d2 \u062c\u0648 \u0627\u0633 \u06a9\u06d2 &quot;\u0645\u0639\u0646\u06cc&#8221; \u06a9\u0648 \u0638\u0627\u06c1\u0631 \u06a9\u0631\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0645\u0642\u0627\u0645 \u062f\u0627\u062e\u0644 \u06a9\u0631\u06cc\u06ba:<\/strong> \u06cc\u06c1 \u0627\u06cc\u06a9 
\u0627\u0648\u0631 \u062a\u0644\u0627\u0634 \u06a9\u06cc \u0645\u06cc\u0632 \u06c1\u06d2 \u062c\u0648 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0628\u062a\u0627\u062a\u06cc \u06c1\u06d2 \u06a9\u06c1 &quot;\u06cc\u06c1 \u0644\u0641\u0638 1st \u06c1\u06d2\u060c \u06cc\u06c1 2nd \u06c1\u06d2\u060c \u06cc\u06c1 3rd \u06c1\u06d2&#8230;&#8221; (\u0628\u0635\u0648\u0631\u062a \u062f\u06cc\u06af\u0631 \u0622\u067e \u0644\u0641\u0638 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628 \u06a9\u0648 \u0646\u06c1\u06cc\u06ba \u062c\u0627\u0646\u062a\u06d2)\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0686\u06be \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u0628\u0644\u0627\u06a9\u0633:<\/strong> \u06cc\u06c1 \u0627\u0648\u067e\u0631 \u0628\u06cc\u0627\u0646 \u06a9\u06cc\u06d2 \u06af\u0626\u06d2 \u0686\u06be \u067e\u0691\u06be\u0646\u06d2 \u06a9\u06d2 \u0686\u06a9\u0631 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0627\u06cc\u0644 \u0627\u06cc\u0645 \u0633\u0631:<\/strong> \u0622\u062e\u0631 \u0645\u06cc\u06ba\u060c \u06c1\u0645 \u0645\u0627\u0688\u0644 \u06a9\u06d2 \u0627\u0646\u062f\u0631\u0648\u0646\u06cc &quot;\u062e\u06cc\u0627\u0644\u0627\u062a&#8221; (384 \u0646\u0645\u0628\u0631\u0632) \u06a9\u0648 32,000 \u0645\u0645\u06a9\u0646\u06c1 \u0627\u06af\u0644\u06d2 \u0627\u0644\u0641\u0627\u0638 \u0645\u06cc\u06ba \u0633\u06d2 \u06c1\u0631 \u0627\u06cc\u06a9 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0633\u06a9\u0648\u0631 \u0645\u06cc\u06ba \u062a\u0628\u062f\u06cc\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u067e\u0627\u0628\u0646\u062f \u0648\u0632\u0646:<\/strong> \u0627\u0646 \u067e\u0679 \u0644\u0648\u06a9 \u0627\u067e \u0679\u06cc\u0628\u0644 \u0627\u0648\u0631 \u0622\u0624\u0679 \u067e\u0679 \u0633\u06a9\u0648\u0631 \u0679\u06cc\u0628\u0644 \u0627\u06cc\u06a9 \u06c1\u06cc \u0688\u06cc\u0679\u0627 \u06a9\u0627 \u0627\u0634\u062a\u0631\u0627\u06a9 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 
\u0645\u06cc\u0645\u0648\u0631\u06cc \u06a9\u0648 \u0628\u0686\u0627\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u062d\u0642\u06cc\u0642\u062a \u0645\u06cc\u06ba \u0628\u06c1\u062a\u0631 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2!<\/p>\n<\/li>\n<\/ul>\n<p><strong>\u067e\u0631\u0648\u0633\u06cc\u0633\u0646\u06af (<\/strong><code>forward<\/code><strong>):<\/strong><\/p>\n<ol>\n<li>\n<p>\u06c1\u0631 \u0644\u0641\u0638 \u06a9\u06d2 \u0645\u0639\u0646\u06cc \u062a\u0644\u0627\u0634 \u06a9\u0631\u06cc\u06ba (\u0627\u06cc\u0645\u0628\u06cc\u0688\u0646\u06af)<\/p>\n<\/li>\n<li>\n<p>\u0645\u0642\u0627\u0645 \u06a9\u06cc \u0645\u0639\u0644\u0648\u0645\u0627\u062a \u0634\u0627\u0645\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u062a\u0648\u062c\u06c1 + \u0633\u0648\u0686 \u06a9\u06d2 6 \u0645\u0631\u0627\u062d\u0644 \u067e\u0631 \u0639\u0645\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u062a\u0645\u0627\u0645 \u0645\u0645\u06a9\u0646\u06c1 \u0627\u06af\u0644\u06d2 \u0627\u0644\u0641\u0627\u0638 \u06a9\u0648 \u0627\u0633\u06a9\u0648\u0631 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0627\u06af\u0631 \u0622\u067e \u06a9\u0648 \u062c\u0648\u0627\u0628 \u0645\u0639\u0644\u0648\u0645 \u06c1\u06d2 \u062a\u0648 \u062d\u0633\u0627\u0628 \u0644\u06af\u0627\u0626\u06cc\u06ba \u06a9\u06c1 \u0622\u067e \u06a9\u062a\u0646\u06d2 \u063a\u0644\u0637 \u062a\u06be\u06d2 (\u0646\u0642\u0635\u0627\u0646)<\/p>\n<\/li>\n<\/ol>\n<p><strong>\u0645\u062a\u0646 \u0628\u0646\u0627\u0626\u06cc\u06ba (<\/strong><code>generate<\/code><strong>):<\/strong> \u0633\u0627\u062f\u06c1 \u0644\u0648\u067e:<\/p>\n<ol>\n<li>\n<p>\u0627\u0628 \u062a\u06a9 \u06a9\u06d2 \u0627\u0644\u0641\u0627\u0638 \u062f\u0631\u062c \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0627\u06af\u0644\u06d2 \u0644\u0641\u0638 \u06a9\u06d2 \u0644\u06cc\u06d2 \u067e\u0648\u0627\u0626\u0646\u0679\u0633 \u062d\u0627\u0635\u0644 
\u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u062f\u0631\u062c\u06c1 \u062d\u0631\u0627\u0631\u062a:<\/strong> \u062a\u062e\u0644\u06cc\u0642\u06cc \u067e\u0646 \u06a9\u0648 \u06a9\u0646\u0679\u0631\u0648\u0644 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u06a9\u0645 = \u0645\u062d\u0641\u0648\u0638\/\u0642\u0627\u0628\u0644\u0650 \u067e\u06cc\u0634 \u06af\u0648\u0626\u06cc\u060c \u0627\u0639\u0644\u06cc = \u062c\u0646\u06af\u0644\u06cc\/\u062a\u062e\u0644\u06cc\u0642\u06cc\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0679\u0627\u067e K:<\/strong> \u0635\u0631\u0641 K \u06a9\u06d2 \u0628\u06c1\u062a\u0631\u06cc\u0646 \u0627\u062e\u062a\u06cc\u0627\u0631\u0627\u062a \u067e\u0631 \u063a\u0648\u0631 \u06a9\u0631\u06cc\u06ba (31,950 \u063a\u06cc\u0631 \u0645\u062a\u0648\u0642\u0639 \u0627\u0644\u0641\u0627\u0638 \u06a9\u0648 \u0646\u0638\u0631 \u0627\u0646\u062f\u0627\u0632 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2)\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0679\u0627\u067e \u067e\u06cc (\u0646\u06cc\u0648\u06a9\u0644\u0626\u0633):<\/strong> \u0645\u062a\u062d\u0631\u06a9 \u0637\u0648\u0631 \u067e\u0631 \u0679\u0648\u06a9\u0646\u0632 \u06a9\u0627 \u0633\u0628 \u0633\u06d2 \u0686\u06be\u0648\u0679\u0627 \u0633\u06cc\u0679 \u0645\u0646\u062a\u062e\u0628 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u062c\u0633 \u06a9\u0627 \u0645\u062c\u0645\u0648\u0639\u06cc \u0627\u0645\u06a9\u0627\u0646 \u062d\u062f \u062a\u06a9 \u067e\u06c1\u0646\u0686 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0628\u0642\u06cc\u06c1 \u0627\u062e\u062a\u06cc\u0627\u0631\u0627\u062a \u0645\u06cc\u06ba \u0633\u06d2 \u0628\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u0627\u06cc\u06a9 \u0644\u0641\u0638 \u06a9\u0627 \u0627\u0646\u062a\u062e\u0627\u0628 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0627\u0633\u06d2 \u0627\u067e\u0646\u06d2 \u062c\u0645\u0644\u06d2 \u0645\u06cc\u06ba \u0634\u0627\u0645\u0644 \u06a9\u0631\u06cc\u06ba \u0627\u0648\u0631 
\u0645\u0631\u062d\u0644\u06c1 1 \u067e\u0631 \u0648\u0627\u067e\u0633 \u062c\u0627\u0626\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0631\u06a9 \u062c\u0627\u0626\u06cc\u06ba \u0627\u06af\u0631: <code>&lt;eos&gt;<\/code> \u067e\u06cc\u062f\u0627 \u06c1\u0648 \u062c\u0627\u0626\u06d2 \u06cc\u0627 <code>max_new_tokens<\/code> \u062a\u06a9 \u067e\u06c1\u0646\u0686 \u062c\u0627\u0626\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<\/ol>\n<h3 id=\"heading-loading-the-dataset-and-training\">\u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u0627\u0648\u0631 \u062a\u0631\u0628\u06cc\u062a \u0644\u0648\u0688 \u06a9\u0631\u06cc\u06ba\u06d4<\/h3>\n<p>\u0633\u0628 \u0633\u06d2 \u067e\u06c1\u0644\u06d2\u060c \u06c1\u0645 JSONL \u06a9\u0627\u0631\u067e\u0633 \u06a9\u0648 \u0644\u0648\u0688 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba \u0627\u0648\u0631 \u062a\u0645\u0627\u0645 \u062f\u0633\u062a\u0627\u0648\u06cc\u0632\u0627\u062a \u06a9\u0648 \u0679\u0648\u06a9\u0646 IDs \u06a9\u06cc \u0627\u06cc\u06a9 \u0637\u0648\u06cc\u0644 \u062a\u0631\u062a\u06cc\u0628 \u0645\u06cc\u06ba \u0679\u0648\u06a9\u0646\u0627\u0626\u0632 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0627\u0633 \u06a9\u06d2 \u0628\u0639\u062f \u06c1\u0645 \u0627\u0633\u06d2 90\/10 \u06a9\u06d2 \u062a\u0646\u0627\u0633\u0628 \u0633\u06d2 \u062a\u0631\u0628\u06cc\u062a \u0627\u0648\u0631 \u062a\u0648\u062b\u06cc\u0642 \u06a9\u06d2 \u0633\u06cc\u0679\u0648\u06ba \u0645\u06cc\u06ba \u062a\u0642\u0633\u06cc\u0645 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba \u0627\u0648\u0631 \u0627\u0633\u06d2 \u0627\u06cc\u06a9 PyTorch \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u0645\u06cc\u06ba \u0644\u067e\u06cc\u0679 \u062f\u06cc\u062a\u06d2 \u06c1\u06cc\u06ba \u062c\u0648 \u0627\u06af\u0644\u06cc \u0679\u0648\u06a9\u0646 \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648\u0626\u06cc \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u0642\u0631\u0631\u06c1 \u0644\u0645\u0628\u0627\u0626\u06cc \u06a9\u06d2 \u0679\u06a9\u0691\u06d2 \u062a\u06cc\u0627\u0631 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<pre><code class=\"language-python\">import json\nfrom tokenizers import 
Tokenizer\nfrom torch.utils.data import Dataset, DataLoader\nfrom tqdm import tqdm\n\n# Device\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\nprint(f\"Using: {device}\")\n\n# Load tokenizer\ntokenizer = Tokenizer.from_file(TOKENIZER_PATH)\nprint(f\"Tokenizer loaded. Vocab: {tokenizer.get_vocab_size():,}\")\n\n# Load and tokenize corpus\nprint(\"Loading corpus...\")\nall_token_ids = []\nwith open(DATA_PATH, \"r\", encoding=\"utf-8\") as f:\n    for line in tqdm(f, desc=\"Tokenizing\"):\n        doc = json.loads(line)\n        encoded = tokenizer.encode(doc[\"text\"])\n        all_token_ids.extend(encoded.ids)\n\nall_token_ids = torch.tensor(all_token_ids, dtype=torch.long)\nprint(f\"Total tokens: {len(all_token_ids):,}\")\n<\/code><\/pre>\n<pre><code class=\"language-python\">class UrduTextDataset(Dataset):\n    def __init__(self, token_ids, seq_len):\n        self.token_ids = token_ids\n        self.seq_len = seq_len\n        self.n_chunks = (len(token_ids) - 1) \/\/ seq_len\n\n    def __len__(self):\n        return self.n_chunks\n\n    def __getitem__(self, idx):\n        start = idx * self.seq_len\n        chunk = self.token_ids[start:start + self.seq_len + 1]\n        return chunk[:-1], chunk[1:]  # input, target (shifted by 1)\n\nconfig = UrduLLMConfig()\n\n# Split 90\/10\nsplit_idx = int(len(all_token_ids) * 0.9)\ntrain_dataset = UrduTextDataset(all_token_ids[:split_idx], config.max_seq_len)\nval_dataset = UrduTextDataset(all_token_ids[split_idx:], config.max_seq_len)\n\ntrain_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)\nval_loader = DataLoader(val_dataset, batch_size=config.batch_size)\n\nprint(f\"Train: {len(train_dataset):,} chunks\")\nprint(f\"Val: {len(val_dataset):,} chunks\")\n<\/code><\/pre>\n<p>\u06c1\u0631 \u062d\u0635\u06c1 256 \u0679\u0648\u06a9\u0646 \u0644\u0645\u0628\u0627 \u06c1\u06d2\u06d4 <code>__getitem__<\/code> \u0631\u067e\u0648\u0631\u0679 <code>(input, target)<\/code> 
\u06cc\u06c1\u0627\u06ba \u06c1\u062f\u0641 \u0627\u06cc\u06a9 \u067e\u0648\u0632\u06cc\u0634\u0646 \u0633\u06d2 \u0645\u0646\u062a\u0642\u0644 \u06c1\u0648\u0646\u06d2 \u0648\u0627\u0644\u0627 \u0627\u0646 \u067e\u0679 \u06c1\u06d2\u060c \u062c\u0648 \u0628\u0627\u0644\u06a9\u0644 \u0648\u06c1\u06cc \u06c1\u06d2 \u062c\u0648 \u0627\u06af\u0644\u06d2 \u0679\u0648\u06a9\u0646 \u06a9\u06cc \u067e\u06cc\u0634\u0646 \u06af\u0648\u0626\u06cc \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u062f\u0631\u06a9\u0627\u0631 \u06c1\u06d2\u06d4<\/p>\n<p>\u0645\u06cc\u0631\u06d2 \u0644\u06cc\u06d2 \u062a\u0631\u0628\u06cc\u062a \u0645\u06cc\u06ba \u062a\u0642\u0631\u06cc\u0628\u0627\u064b 3 \u06af\u06be\u0646\u0679\u06d2 \u0644\u06af\u06d2 \u0627\u0648\u0631 \u0645\u06cc\u06ba \u0646\u06d2 3 \u062f\u0648\u0631 \u0645\u06a9\u0645\u0644 \u06a9\u0631 \u0644\u06cc\u06d2\u06d4 \u0628\u0646\u06cc\u0627\u062f\u06cc \u0637\u0648\u0631 \u067e\u0631\u060c \u0645\u062c\u06be\u06d2 10 \u062f\u0648\u0631 \u06a9\u0631\u0646\u06d2 \u062a\u06be\u06d2\u060c \u0644\u06cc\u06a9\u0646 3 \u06a9\u06d2 \u0628\u0639\u062f \u0645\u06cc\u06ba \u0646\u06d2 Google Colab \u06a9\u06cc \u0645\u0641\u062a \u062d\u062f \u06a9\u0648 \u0645\u0627\u0631\u0627\u06d4 \u0686\u0648\u0646\u06a9\u06c1 \u062a\u0631\u0628\u06cc\u062a \u06a9\u0627 \u0645\u0642\u0635\u062f \u0633\u06cc\u06a9\u06be\u0646\u0627 \u06c1\u06d2\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u0645\u06cc\u06ba \u0646\u06d2 \u0628\u0646\u0627\u06cc\u0627 \u06c1\u0648\u0627 \u0645\u0627\u0688\u0644 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u06cc\u0627 \u0627\u0648\u0631 \u0627\u0633\u06d2 \u0688\u0631\u0627\u0626\u06cc\u0648 \u0645\u06cc\u06ba \u0645\u062d\u0641\u0648\u0638 \u06a9\u0631 \u0644\u06cc\u0627\u06d4<\/p>\n<p>\u0645\u06a9\u0645\u0644 \u062a\u0631\u0628\u06cc\u062a\u06cc \u06a9\u0648\u0688 \u0630\u06cc\u0644 \u0645\u06cc\u06ba \u06c1\u06d2:<\/p>\n<pre><code class=\"language-python\"># Optimizer\noptimizer = 
torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n\n# LR Schedule\ntotal_steps = len(train_loader) * config.max_epochs\ndef get_lr(step):\n    if step < config.warmup_steps:\n        return config.learning_rate * step \/ config.warmup_steps\n    progress = (step - config.warmup_steps) \/ (total_steps - config.warmup_steps)\n    return config.learning_rate * 0.5 * (1 + math.cos(math.pi * progress))\n\n# Training\nhistory = {'train_loss': [], 'val_loss': []}\nglobal_step = 0\nbest_val_loss = float('inf')\n\nfor epoch in range(config.max_epochs):\n    model.train()\n    epoch_loss = 0\n    pbar = tqdm(train_loader, desc=f\"Epoch {epoch+1}\")\n\n    for input_ids, targets in pbar:\n        input_ids, targets = input_ids.to(device), targets.to(device)\n\n        lr = get_lr(global_step)\n        for g in optimizer.param_groups:\n            g['lr'] = lr\n\n        outputs = model(input_ids, targets)\n        loss = outputs['loss']\n\n        optimizer.zero_grad()\n        loss.backward()\n        torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)\n        optimizer.step()\n\n        epoch_loss += loss.item()\n        global_step += 1\n        pbar.set_postfix({'loss': f'{loss.item():.4f}'})\n\n    # Validation\n    model.eval()\n    val_loss = 0\n    with torch.no_grad():\n        for input_ids, targets in val_loader:\n            input_ids, targets = input_ids.to(device), targets.to(device)\n            val_loss += model(input_ids, targets)['loss'].item()\n    val_loss \/= len(val_loader)\n\n    train_loss = epoch_loss \/ len(train_loader)\n    history['train_loss'].append(train_loss)\n    history['val_loss'].append(val_loss)\n\n    print(f\"Epoch {epoch+1}: Train={train_loss:.4f}, Val={val_loss:.4f}\")\n\n    # Save best\n    if val_loss < best_val_loss:\n        best_val_loss = val_loss\n        torch.save(model.state_dict(), f\"{DRIVE_PATH}\/best_model.pt\")\n        print(f\"Best model 
saved!\")\n\nprint(f\"\\nDone! Best val loss: {best_val_loss:.4f}\")\n<\/code><\/pre>\n<p>\u0627\u0628 \u0622\u0626\u06cc\u06d2 \u062a\u062c\u0632\u06cc\u06c1 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba \u06a9\u06c1 \u062a\u0631\u0628\u06cc\u062a\u06cc \u06a9\u0648\u0688 \u06a9\u0627 \u06c1\u0631 \u062d\u0635\u06c1 \u06a9\u06cc\u0627 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<h3 id=\"heading-training-code-explained-line-by-line\">\u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u06a9\u0648\u0688 \u06a9\u06cc \u0648\u0636\u0627\u062d\u062a: \u0644\u0627\u0626\u0646 \u0628\u06c1 \u0644\u0627\u0626\u0646<\/h3>\n<h4 id=\"heading-1-optimizer-setup\">1. \u0622\u067e\u0679\u06cc\u0645\u0627\u0626\u0632\u0631 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628\u0627\u062a<\/h4>\n<pre><code class=\"language-python\">optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)\n<\/code><\/pre>\n<p><code>AdamW<\/code>    \u0641\u06cc \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 \u067e\u0631 \u0639\u0645\u0644\u062f\u0631\u0622\u0645\u062f \u06a9\u06d2 \u062f\u0648 \u0627\u0639\u062f\u0627\u062f\u0648\u0634\u0645\u0627\u0631 \u06a9\u0648 \u0628\u0631\u0642\u0631\u0627\u0631 \u0631\u06a9\u06be\u062a\u0627 \u06c1\u06d2 (23M \u00d7 2 = 46M \u0627\u0636\u0627\u0641\u06cc \u0642\u062f\u0631 \u0645\u06cc\u0645\u0648\u0631\u06cc \u0645\u06cc\u06ba)\u06d4<\/p>\n<ul>\n<li>\n<p><strong>\u067e\u06c1\u0644\u0627 \u0644\u0645\u062d\u06c1 (\u0645\u0648\u0645\u06cc\u0646\u0679\u0645):<\/strong> \u0688\u06be\u0644\u0648\u0627\u0646 \u06a9\u06cc \u0627\u06cc\u06a9 \u062a\u06cc\u0632 \u0631\u0641\u062a\u0627\u0631 \u062d\u0631\u06a9\u062a \u0627\u0648\u0633\u0637\u06d4 \u0634\u0648\u0631 \u0645\u0686\u0627\u0646\u06d2 \u0648\u0627\u0644\u06cc \u0627\u067e \u0688\u06cc\u0679\u0633 \u06a9\u0648 \u06c1\u0645\u0648\u0627\u0631 \u06a9\u0631\u06cc\u06ba \u062a\u0627\u06a9\u06c1 \u0622\u067e\u0679\u0645\u0627\u0626\u0632\u0631 
\u0679\u06cc\u0691\u06be\u0627 \u0646\u06c1 \u06c1\u0648\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u062f\u0648\u0633\u0631\u0627 \u0644\u0645\u062d\u06c1:<\/strong> \u0645\u0631\u0628\u0639 \u0688\u06be\u0644\u0648\u0627\u0646 \u06a9\u0627 \u0627\u06cc\u06a9 \u06a9\u0641\u0627\u06cc\u062a\u06cc \u062d\u0631\u06a9\u062a \u067e\u0630\u06cc\u0631\u06cc \u0627\u0648\u0633\u0637\u06d4 \u06c1\u0631 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u06cc\u06a9 \u0645\u0646\u0641\u0631\u062f \u0627\u0646\u06a9\u0648\u0644\u06cc \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06cc \u0634\u0631\u062d \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 (\u06a9\u062b\u0631\u062a \u0633\u06d2 \u0627\u067e \u0688\u06cc\u0679 \u06c1\u0648\u0646\u06d2 \u0648\u0627\u0644\u06d2 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0686\u06be\u0648\u0679\u06d2 \u0627\u0642\u062f\u0627\u0645\u0627\u062a \u0627\u0648\u0631 \u06a9\u0628\u06be\u06cc \u06a9\u0628\u06be\u0627\u0631 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0628\u0691\u06d2 \u0627\u0642\u062f\u0627\u0645\u0627\u062a)\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u0648\u0632\u0646 \u0645\u06cc\u06ba \u06a9\u0645\u06cc (0.1):<\/strong> \u06c1\u0631 \u0642\u062f\u0645 \u067e\u0631 \u0648\u0632\u0646 \u06a9\u0648 \u0636\u0631\u0628 \u062f\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 <code>(1 - lr \u00d7 0.1)<\/code>\u062a\u06be\u0648\u0691\u0627 \u0633\u0627 \u0632\u0648\u0645 \u0622\u0624\u0679 \u06a9\u0631\u06cc\u06ba\u06d4 \u06cc\u06c1 \u06c1\u06d2 <strong>L2 \u0631\u06cc\u06af\u0648\u0644\u0631\u0627\u0626\u0632\u06cc\u0634\u0646<\/strong>. 
\u0627\u06cc\u06a9 \u0648\u0632\u0646 \u06a9\u0648 \u0628\u06c1\u062a \u0632\u06cc\u0627\u062f\u06c1 \u06c1\u0648\u0646\u06d2 \u0633\u06d2 \u0631\u0648\u06a9 \u06a9\u0631 \u0627\u0648\u0648\u0631 \u0641\u0679\u0646\u06af \u06a9\u0648 \u06a9\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 AdamW \u0645\u06cc\u06ba \"W\" \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u06a9\u06c1 \u0627\u0633 \u06a9\u0645\u06cc \u06a9\u0648 \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679 \u0627\u067e \u0688\u06cc\u0679 \u0633\u06d2 \u0627\u0644\u06af (decoupled) \u0631\u06a9\u06be\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2 (\u0648\u0646\u06cc\u0644\u0627 \u0627\u06cc\u0688\u0645 \u06a9\u06cc \u0637\u0631\u062d \u0645\u06cc\u0644\u0627\u0646 \u0645\u06cc\u06ba \u0645\u0644\u0627\u0646\u06d2 \u06a9\u06d2 \u0628\u062c\u0627\u0626\u06d2 \u0628\u0631\u0627\u06c1 \u0631\u0627\u0633\u062a \u0648\u0632\u0646 \u067e\u0631 \u0644\u0627\u06af\u0648 \u06c1\u0648\u062a\u0627 \u06c1\u06d2)\u06d4<\/p>\n<\/li>\n<\/ul>\n<h4 id=\"heading-2-learning-rate-schedule\">2. 
\u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06cc \u0634\u0631\u062d \u06a9\u0627 \u0634\u06cc\u0688\u0648\u0644<\/h4>\n<pre><code class=\"language-python\">total_steps = len(train_loader) * config.max_epochs  # e.g., 500 batches \u00d7 10 epochs = 5000 steps\n\ndef get_lr(step):\n    if step < config.warmup_steps:                                      # Phase 1: steps 0\u2013499\n        return config.learning_rate * step \/ config.warmup_steps        # Linear ramp: 0 \u2192 3e-4\n    progress = (step - config.warmup_steps) \/ (total_steps - config.warmup_steps)  # 0.0 \u2192 1.0\n    return config.learning_rate * 0.5 * (1 + math.cos(math.pi * progress))        # 3e-4 \u2192 ~0\n<\/code><\/pre>\n<ul>\n<li>\n<p><strong>\u0648\u0627\u0631\u0645 \u0627\u067e (\u067e\u06c1\u0644\u06d2 500 \u0645\u0631\u0627\u062d\u0644):<\/strong> \u0645\u0631\u062d\u0644\u06d2 0 \u0645\u06cc\u06ba\u060c \u0648\u0632\u0646 \u0628\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u06c1\u0648\u062a\u06d2 \u06c1\u06cc\u06ba \u0627\u0648\u0631 \u06af\u0631\u06cc\u0688\u06cc\u0626\u0646\u0679\u0633 \u0646\u06cc\u0645 \u0628\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u0633\u0645\u062a\u0648\u06ba \u06a9\u06cc \u0637\u0631\u0641 \u0627\u0634\u0627\u0631\u06c1 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u0644\u06c1\u0630\u0627 \u0627\u06cc\u06a9 \u0628\u0691\u0627 LR \u062a\u0628\u0627\u06c1 \u06a9\u0646 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 \u0627\u067e \u0688\u06cc\u0679\u0633 \u06a9\u0627 \u0646\u062a\u06cc\u062c\u06c1 \u06c1\u0648\u06af\u0627\u06d4 0 \u0633\u06d2 3e-4 \u062a\u06a9 \u0644\u06a9\u06cc\u0631\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0627\u0636\u0627\u0641\u06c1 \u06a9\u0631\u06a9\u06d2\u060c \u06c1\u0645 \u062c\u0627\u0631\u062d\u0627\u0646\u06c1 \u0627\u067e\u0688\u06cc\u0679\u0633 \u06a9\u0631\u0646\u06d2 \u0633\u06d2 \u067e\u06c1\u0644\u06d2 \u0646\u0642\u0635\u0627\u0646 \u06a9\u06d2 \u0645\u0627\u062d\u0648\u0644 \u06a9\u0648 \"\u0645\u0633\u062a\u062d\u06a9\u0645\" 
\u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u06a9\u0648\u0632\u0627\u0626\u0646 \u06a9\u0645\u06cc (cosine decay) (\u0628\u0627\u0642\u06cc \u0645\u0631\u0627\u062d\u0644):<\/strong> \u0641\u0627\u0631\u0645\u0648\u0644\u0627 <code>0.5 \u00d7 (1 + cos(\u03c0 \u00d7 progress))<\/code> \u062c\u06cc\u0633\u06d2 \u062c\u06cc\u0633\u06d2 progress 0 \u0633\u06d2 1 \u062a\u06a9 \u0628\u0691\u06be\u062a\u0627 \u06c1\u06d2\u060c 1.0 \u0633\u06d2 0.0 \u062a\u06a9 \u0627\u06cc\u06a9 \u06c1\u0645\u0648\u0627\u0631 S-\u06a9\u0631\u0648 \u0628\u0646\u0627\u062a\u0627 \u06c1\u06d2\u06d4 \u0627\u0633\u06d2 \u0686\u0648\u0679\u06cc \u06a9\u06d2 LR \u0633\u06d2 \u0636\u0631\u0628 \u062f\u06cc\u0646\u06d2 \u067e\u0631 \u06cc\u06c1 \u0645\u0644\u062a\u0627 \u06c1\u06d2:<\/p>\n<\/li>\n<\/ul>\n<pre><code class=\"language-plaintext\">LR:  0 \u2500\u2500ramp\u2500\u2500&#x25b6; peak \u2500\u2500smooth curve\u2500\u2500&#x25b6; ~0\n     |  warmup  |     cosine decay      |\n<\/code><\/pre>\n<h4 id=\"heading-3-tracking-variables\">3. \u0645\u062a\u063a\u06cc\u0631\u0627\u062a \u06a9\u0648 \u0679\u0631\u06cc\u06a9 \u06a9\u0631\u0646\u0627<\/h4>\n<pre><code class=\"language-python\">history = {'train_loss': [], 'val_loss': []}   # For plotting curves later\nglobal_step = 0                                 # Counts total batches across all epochs (for LR schedule)\nbest_val_loss = float('inf')                    # Tracks best validation; starts at infinity so any real loss beats it\n<\/code><\/pre>\n<h4 id=\"heading-4-training-loop\">4. 
\u0679\u0631\u06cc\u0646\u0646\u06af \u0644\u0648\u067e<\/h4>\n<p><strong>\u0628\u06cc\u0631\u0648\u0646\u06cc \u0644\u0648\u067e: \u0627\u062f\u0648\u0627\u0631 (epochs)<\/strong><\/p>\n<pre><code class=\"language-python\">for epoch in range(config.max_epochs):\n    model.train()     # Enables dropout (randomly zeros 10% of activations for regularization)\n<\/code><\/pre>\n<p>\u06c1\u0631 \u062f\u0648\u0631 = \u062a\u0645\u0627\u0645 \u062a\u0631\u0628\u06cc\u062a\u06cc \u0688\u06cc\u0679\u0627 \u0633\u06d2 \u0627\u06cc\u06a9 \u0645\u06a9\u0645\u0644 \u067e\u0627\u0633\u06d4 \u06c1\u0645 \u0627\u0633\u06d2 <code>max_epochs<\/code> \u0628\u0627\u0631 \u062f\u06c1\u0631\u0627\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p><strong>\u0627\u0646\u062f\u0631\u0648\u0646\u06cc \u0644\u0648\u067e: \u0628\u06cc\u0686\u0632 (batches)<\/strong><\/p>\n<p><strong>1. GPU \u067e\u0631 \u062c\u0627\u0626\u06cc\u06ba:<\/strong><\/p>\n<pre><code class=\"language-python\">input_ids, targets = input_ids.to(device), targets.to(device)\n<\/code><\/pre>\n<p>\u0679\u06cc\u0646\u0633\u0631 \u0688\u06cc\u0679\u0627 \u06a9\u0648 CPU RAM \u0633\u06d2 GPU VRAM \u0645\u06cc\u06ba \u0645\u0646\u062a\u0642\u0644 \u06a9\u0631\u06cc\u06ba\u06d4 \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u0645\u06cc\u06ba \u0645\u06cc\u0679\u0631\u06a9\u0633 \u0636\u0631\u0628 (\u062a\u0648\u062c\u06c1\u060c FFN) \u0628\u0691\u06d2 \u067e\u06cc\u0645\u0627\u0646\u06d2 \u067e\u0631 \u0645\u062a\u0648\u0627\u0632\u06cc \u06c1\u0648\u0646\u06d2 \u06a9\u06cc \u0648\u062c\u06c1 \u0633\u06d2 GPUs \u067e\u0631 50 \u0633\u06d2 100 \u06af\u0646\u0627 \u062a\u06cc\u0632 \u0686\u0644\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<p><strong>2. 
\u062f\u0633\u062a\u06cc LR \u0627\u067e \u0688\u06cc\u0679:<\/strong><\/p>\n<pre><code class=\"language-python\">lr = get_lr(global_step)\nfor g in optimizer.param_groups:\n    g['lr'] = lr\n<\/code><\/pre>\n<p>PyTorch \u06a9\u0627 AdamW \u0645\u0642\u0627\u0645\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0635\u0627\u0631\u0641 \u06a9\u06d2 \u0637\u06d2 \u0634\u062f\u06c1 \u0646\u0638\u0627\u0645 \u0627\u0644\u0627\u0648\u0642\u0627\u062a \u06a9\u06cc \u062d\u0645\u0627\u06cc\u062a \u0646\u06c1\u06cc\u06ba \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u06c1\u0645 \u06c1\u0631 \u0642\u062f\u0645 \u067e\u0631 \u0627\u06cc\u0644 \u0622\u0631 \u06a9\u0648 \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0627\u0648\u0648\u0631 \u0631\u0627\u0626\u06cc\u0688 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 <code>param_groups<\/code> \u0627\u06cc\u06a9 \u0641\u06c1\u0631\u0633\u062a \u06c1\u06d2 (\u06cc\u06c1\u0627\u06ba \u0627\u06cc\u06a9 \u06af\u0631\u0648\u067e) \u0627\u0648\u0631 \u06c1\u0631 \u06af\u0631\u0648\u067e \u06a9\u0627 \u0627\u067e\u0646\u0627 LR\/\u0648\u0632\u0646 \u0645\u06cc\u06ba \u06a9\u0645\u06cc \u06c1\u0648 \u0633\u06a9\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<p><strong>3. 
\u0641\u0627\u0631\u0648\u0631\u0688 \u067e\u0627\u0633:<\/strong><\/p>\n<pre><code class=\"language-python\">outputs = model(input_ids, targets)\nloss = outputs['loss']\n<\/code><\/pre>\n<p>\u0627\u0646 \u067e\u0679 \u0679\u0648\u06a9\u0646 \u0627\u06cc\u0645\u0628\u06cc\u0688\u0646\u06af \u2192 \u0686\u06be \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u0628\u0644\u0627\u06a9\u0633 \u2192 LM \u06c1\u06cc\u0688 \u2192 \u0644\u0627\u06af\u0679 \u0633\u06d2 \u06af\u0632\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u06a9\u0631\u0627\u0633 \u0627\u06cc\u0646\u0679\u0631\u0648\u067e\u06cc \u0646\u0642\u0635\u0627\u0646 \u06a9\u0627 \u062d\u0633\u0627\u0628 \u0644\u0627\u06af\u0679\u0633 (\u0634\u06a9\u0644 <code>[batch, seq_len, 32000]<\/code>) \u0627\u0648\u0631 \u06c1\u062f\u0641 \u0679\u0648\u06a9\u0646 IDs \u06a9\u06d2 \u062f\u0631\u0645\u06cc\u0627\u0646 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0646\u0642\u0635\u0627\u0646 \u0627\u0633 \u0645\u0646\u0641\u06cc \u0644\u0627\u06af \u0627\u0645\u06a9\u0627\u0646 \u06a9\u06cc \u067e\u06cc\u0645\u0627\u0626\u0634 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u062c\u0648 \u0645\u0627\u0688\u0644 \u0627\u06af\u0644\u06d2 \u062f\u0631\u0633\u062a \u0679\u0648\u06a9\u0646 \u06a9\u0648 \u062a\u0641\u0648\u06cc\u0636 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u060c \u062c\u0633 \u06a9\u0627 \u0627\u0648\u0633\u0637 \u062a\u0645\u0627\u0645 \u067e\u0648\u0632\u06cc\u0634\u0646 \u0627\u0648\u0631 \u0628\u06cc\u0686 \u06a9\u06d2 \u0639\u0646\u0627\u0635\u0631 \u067e\u0631 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p><strong>4. 
\u0631\u06cc\u0648\u0631\u0633 \u067e\u0627\u0633 + \u0627\u067e \u0688\u06cc\u0679:<\/strong><\/p>\n<pre><code class=\"language-python\">optimizer.zero_grad()          # Reset all parameter gradients to zero (they accumulate by default)\nloss.backward()                # Backpropagation: compute \u2202loss\/\u2202\u03b8 for all 23M parameters via chain rule\ntorch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_clip)  # If ||gradient||\u2082 > 1.0, scale it down\noptimizer.step()               # \u03b8_new = \u03b8_old - lr \u00d7 adam_adjusted_gradient - lr \u00d7 weight_decay \u00d7 \u03b8_old\n<\/code><\/pre>\n<ul>\n<li>\n<p><code>zero_grad()<\/code><strong>:<\/strong> PyTorch \u067e\u06c1\u0644\u06d2 \u0633\u06d2 \u0637\u06d2 \u0634\u062f\u06c1 \u0637\u0648\u0631 \u067e\u0631 \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679 \u062c\u0645\u0639 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 (\u0645\u0627\u0626\u06cc\u06a9\u0631\u0648 \u0628\u06cc\u0686\u0648\u06ba \u0645\u06cc\u06ba \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679 \u062c\u0645\u0639 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u0641\u06cc\u062f)\u06d4 \u06c1\u0631 \u0646\u0626\u06d2 \u067e\u0633\u0645\u0627\u0646\u062f\u06c1 \u067e\u0627\u0633 \u0633\u06d2 \u067e\u06c1\u0644\u06d2 \u0627\u0633\u06d2 \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0635\u0627\u0641 \u06a9\u0631\u0646\u0627 \u0636\u0631\u0648\u0631\u06cc \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p><code>loss.backward()<\/code><strong>:<\/strong> \u0628\u06cc\u06a9 \u067e\u0631\u0648\u067e\u06cc\u06af\u06cc\u0634\u0646 \u06a9\u0645\u067e\u06cc\u0648\u0679\u06cc\u0634\u0646\u0644 \u06af\u0631\u0627\u0641 \u06a9\u0648 \u067e\u06cc\u0686\u06be\u06d2 \u06a9\u06cc \u0637\u0631\u0641 \u0644\u06d2 \u062c\u0627\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u062a\u0645\u0627\u0645 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 \u06a9\u06d2 \u0644\u06cc\u06d2 \u2202loss\/\u2202\u03b8 \u06a9\u06cc \u06af\u0646\u062a\u06cc 
\u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0686\u06cc\u0646 \u06a9\u06d2 \u0627\u0635\u0648\u0644 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1\u060c \u0641\u0627\u0631\u0648\u0631\u0688 \u067e\u0627\u0633 \u06a9\u06d2 \u0633\u0627\u062a\u06be\u060c \u0633\u0628 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u06a9\u0645\u067e\u06cc\u0648\u0679 \u06a9\u0631\u0646\u06d2 \u0648\u0627\u0644\u0627 \u0642\u062f\u0645 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p><strong>\u06af\u0631\u06cc\u0688\u06cc\u0646\u0679 \u06a9\u0644\u067e\u0646\u06af:<\/strong> \u062a\u0645\u0627\u0645 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679\u0633 \u06a9\u0648 \u0627\u06cc\u06a9 \u0648\u06cc\u06a9\u0679\u0631 \u0645\u06cc\u06ba \u062c\u0648\u0691 \u06a9\u0631 \u0627\u0633 \u06a9\u0627 L2 \u0646\u0627\u0631\u0645 \u0646\u06a9\u0627\u0644\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 \u0627\u06af\u0631 \u0646\u0627\u0631\u0645 1.0 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u06c1\u0648 \u062a\u0648 \u062a\u0645\u0627\u0645 \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679\u0633 \u06a9\u0648 <code>1.0\/norm<\/code> \u0633\u06d2 \u0636\u0631\u0628 \u062f\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0633\u0645\u062a \u06a9\u0648 \u0628\u0631\u0642\u0631\u0627\u0631 \u0631\u06a9\u06be\u062a\u0627 \u06c1\u06d2 \u0644\u06cc\u06a9\u0646 \u0633\u0627\u0626\u0632 \u06a9\u0648 \u0645\u062d\u062f\u0648\u062f \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0646\u0627\u06cc\u0627\u0628 \u0628\u06cc\u0686\u0632 (\u063a\u06cc\u0631 \u0645\u0639\u0645\u0648\u0644\u06cc \u0679\u0648\u06a9\u0646 \u06a9\u06cc \u062a\u0642\u0633\u06cc\u0645) \u06a9\u0648 \u062a\u0628\u0627\u06c1 \u06a9\u0646 \u0628\u0691\u06d2 \u0627\u067e \u0688\u06cc\u0679\u0633 \u0633\u06d2 \u0631\u0648\u06a9\u062a\u0627 \u06c1\u06d2 
\u062c\u0648 \u062a\u0631\u0628\u06cc\u062a \u06a9\u0648 \u063a\u06cc\u0631 \u0645\u0633\u062a\u062d\u06a9\u0645 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><code>optimizer.step()<\/code><strong>:<\/strong> \u0627\u06cc\u0688\u0645 \u0688\u0628\u0644\u06cc\u0648 \u0645\u0648\u0645\u06cc\u0646\u0679\u0645\u060c \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u062e\u0635\u0648\u0635 \u0627\u0646\u0688\u06cc\u067e\u0679\u06cc\u0648 \u0627\u06cc\u0644 \u0622\u0631\u060c \u0627\u0648\u0631 \u0688\u06cc\u06a9\u067e\u0644\u0688 \u0648\u06cc\u0679 \u0688\u06d2 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0627\u067e \u0688\u06cc\u0679 \u06a9\u06d2 \u0642\u0648\u0627\u0646\u06cc\u0646 \u06a9\u0627 \u0627\u0637\u0644\u0627\u0642 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<\/ul>\n<p><strong>5. \u0628\u06a9 \u06a9\u06cc\u067e\u0646\u06af:<\/strong><\/p>\n<pre><code class=\"language-python\">epoch_loss += loss.item()      # .item() extracts the Python float from the CUDA tensor (avoids GPU memory leak)\nglobal_step += 1               # Increment for LR schedule\npbar.set_postfix({'loss': ...})  # Update the tqdm progress bar display\n<\/code><\/pre>\n<h4 id=\"heading-6-validation\">6. 
\u062a\u0635\u062f\u06cc\u0642<\/h4>\n<pre><code class=\"language-python\">model.eval()                   # Disables dropout so we use full model capacity for honest evaluation\nval_loss = 0\nwith torch.no_grad():          # Disables gradient tracking, saves ~50% memory and runs faster\n    for input_ids, targets in val_loader:\n        input_ids, targets = input_ids.to(device), targets.to(device)\n        val_loss += model(input_ids, targets)['loss'].item()\nval_loss \/= len(val_loader)    # Average loss per batch\n<\/code><\/pre>\n<p>\u06cc\u06c1 \u0622\u067e \u06a9\u06d2 \u067e\u0627\u0633 \u0645\u0648\u062c\u0648\u062f \u0688\u06cc\u0679\u0627 \u06a9\u06cc \u062c\u0627\u0646\u0686 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u062c\u0633 \u067e\u0631 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u06a9\u0628\u06be\u06cc \u062a\u0631\u0628\u06cc\u062a \u0646\u06c1\u06cc\u06ba \u062f\u06cc \u06af\u0626\u06cc \u06c1\u06d2\u06d4 \u0679\u0631\u06cc\u0646 \u06a9\u06d2 \u0646\u0642\u0635\u0627\u0646\u0627\u062a \u0627\u0648\u0631 \u062a\u0648\u062b\u06cc\u0642 \u06a9\u06d2 \u0646\u0642\u0635\u0627\u0646\u0627\u062a \u06a9\u0627 \u0645\u0648\u0627\u0632\u0646\u06c1 \u06a9\u0631\u0646\u06d2 \u0633\u06d2 \u062f\u0631\u062c \u0630\u06cc\u0644 \u067e\u062a\u06c1 \u0686\u0644\u062a\u0627 \u06c1\u06d2:<\/p>\n<table>\n<thead>\n<tr>\n<th>\u067e\u06cc\u0679\u0631\u0646<\/th>\n<th>\u0645\u0639\u0646\u06cc<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>\u062f\u0648\u0646\u0648\u06ba \u06a9\u0645\u06cc<\/td>\n<td>\u0645\u0627\u0688\u0644 \u0639\u0627\u0645 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0642\u0627\u0628\u0644 \u0646\u0645\u0648\u0646\u0648\u06ba \u06a9\u0648 \u0633\u06cc\u06a9\u06be \u0631\u06c1\u0627 \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td>\u0679\u0631\u06cc\u0646 \u2193\u060c \u062a\u0648\u062b\u06cc\u0642 \u0631\u06a9 \u062c\u0627\u0626\u06d2\/\u2191<\/td>\n<td>\u0627\u0648\u0648\u0631 \u0641\u0679\u0646\u06af: \u062d\u0641\u0638 \u06a9\u0631\u0646\u0627\u060c \u0633\u06cc\u06a9\u06be\u0646\u0627 
\u0646\u06c1\u06cc\u06ba\u06d4<\/td>\n<\/tr>\n<tr>\n<td>\u0627\u0639\u0644\u06cc \u0627\u0648\u0631 \u0641\u0644\u06cc\u0679<\/td>\n<td>\u0627\u0646\u0688\u0631 \u0641\u0679\u0646\u06af: \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0632\u06cc\u0627\u062f\u06c1 \u0635\u0644\u0627\u062d\u06cc\u062a \u06cc\u0627 \u0688\u06cc\u0679\u0627 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p><code>model.eval()<\/code>    \u0686\u0648\u0646\u06a9\u06c1 \u0688\u0631\u0627\u067e \u0622\u0624\u0679 \u0622\u0641 \u06c1\u06d2\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u067e\u0648\u0631\u06d2 \u0645\u0627\u0688\u0644 \u06a9\u0627 \u062c\u0627\u0626\u0632\u06c1 \u0644\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 <code>torch.no_grad()<\/code> \u0686\u0648\u0646\u06a9\u06c1 \u06c1\u0645 \u0635\u0631\u0641 \u067e\u06cc\u0645\u0627\u0626\u0634 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u0633\u06cc\u06a9\u06be\u062a\u06d2 \u0646\u06c1\u06cc\u06ba\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u06c1\u0645 \u062a\u062f\u0631\u06cc\u062c\u06cc \u062d\u0633\u0627\u0628 \u06a9\u0648 \u0686\u06be\u0648\u0691 \u062f\u06cc\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<h4 id=\"heading-7-checkpointing\">7. 
\u0686\u06cc\u06a9 \u067e\u0648\u0627\u0626\u0646\u0679<\/h4>\n<pre><code class=\"language-python\">if val_loss < best_val_loss:\n    best_val_loss = val_loss\n    torch.save(model.state_dict(), f\"{DRIVE_PATH}\/best_model.pt\")\n<\/code><\/pre>\n<p><code>model.state_dict()<\/code> \u0627\u06cc\u06a9 <code>OrderedDict<\/code> \u0648\u0627\u067e\u0633 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u062c\u0648 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 \u06a9\u06d2 \u0646\u0627\u0645\u0648\u06ba \u06a9\u0648 \u0679\u06cc\u0646\u0633\u0631\u0632 \u0633\u06d2 \u062c\u0648\u0691\u062a\u0627 \u06c1\u06d2\u06d4 <code>torch.save<\/code> \u0627\u0633\u06d2 \u0688\u0633\u06a9 \u067e\u0631 \u0633\u06cc\u0631\u06cc\u0644\u0627\u0626\u0632 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 Python \u06a9\u0627 pickle + zip \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u06c1\u0645 \u0635\u0631\u0641 \u0627\u0633\u06cc \u0635\u0648\u0631\u062a \u0645\u06cc\u06ba \u0645\u062d\u0641\u0648\u0638 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba \u062c\u0628 \u062a\u0648\u062b\u06cc\u0642\u06cc \u0646\u0642\u0635\u0627\u0646 \u0628\u06c1\u062a\u0631 \u06c1\u0648\u06d4<\/p>\n<p>\u06cc\u06c1 \u0627\u067e\u0646\u06cc \u0631\u0648\u062d \u0645\u06cc\u06ba <strong>\u0627\u0631\u0644\u06cc \u0627\u0633\u0679\u0627\u067e\u0646\u06af (early stopping)<\/strong> \u06c1\u06d2: \u06c1\u0645 \u0627\u0646 \u0686\u06cc\u06a9 \u067e\u0648\u0627\u0626\u0646\u0679\u0633 \u06a9\u0648 \u0628\u0631\u0642\u0631\u0627\u0631 \u0631\u06a9\u06be\u062a\u06d2 \u06c1\u06cc\u06ba \u062c\u0648 \u0633\u0628 \u0633\u06d2 \u0628\u06c1\u062a\u0631 \u062c\u0646\u0631\u0644\u0627\u0626\u0632 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u0642\u0637\u0639 \u0646\u0638\u0631 \u0627\u0633 \u06a9\u06d2 \u06a9\u06c1 \u0628\u0639\u062f \u06a9\u06d2 \u0627\u062f\u0648\u0627\u0631 \u0645\u06cc\u06ba \u06a9\u06cc\u0627 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<h4 id=\"heading-summary-one-batch-in-6-steps\">\u062e\u0644\u0627\u0635\u06c1: 6 \u0645\u0631\u0627\u062d\u0644 \u0645\u06cc\u06ba 
\u0627\u06cc\u06a9 \u06c1\u06cc \u0628\u06cc\u0686<\/h4>\n<ol>\n<li>\n<p>\u0645\u0627\u0688\u0644 \u06a9\u06d2 \u0630\u0631\u06cc\u0639\u06d2 32 \u0627\u0631\u062f\u0648 \u062a\u0631\u062a\u06cc\u0628\u06cc\u06ba \u0641\u06cc\u0688 \u06a9\u0631\u06cc\u06ba \u2192 \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648\u0626\u06cc \u06a9\u06d2 \u0627\u0645\u06a9\u0627\u0646\u0627\u062a \u062d\u0627\u0635\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u06a9\u0631\u0627\u0633 \u0627\u06cc\u0646\u0679\u0631\u0648\u067e\u06cc \u0628\u0645\u0642\u0627\u0628\u0644\u06c1 \u0627\u0635\u0644 \u0627\u06af\u0644\u0627 \u0679\u0648\u06a9\u0646 \u2192 \u0627\u0633\u06a9\u06cc\u0644\u0631 \u0646\u0642\u0635\u0627\u0646 (\u06cc\u06c1 \u06a9\u062a\u0646\u0627 \u063a\u0644\u0637 \u06c1\u06d2\u061f)<\/p>\n<\/li>\n<li>\n<p>23M \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 \u06a9\u06d2 \u0630\u0631\u06cc\u0639\u06d2 \u0628\u06cc\u06a9 \u067e\u0631\u0648\u067e\u06cc\u06af\u06cc\u0634\u0646 \u2192 \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679 \u0641\u06cc \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 (\u06a9\u06cc\u0627 \u062a\u0631\u0645\u06cc\u0645 \u06a9\u0631\u0646\u0627 \u06c1\u06d2\u061f)<\/p>\n<\/li>\n<li>\n<p>1.0 \u0633\u06d2 \u0646\u06cc\u0686\u06d2 \u06a9\u0644\u067e \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679 \u0645\u0639\u06cc\u0627\u0631 \u2192 \u0639\u062f\u0645 \u0627\u0633\u062a\u062d\u06a9\u0627\u0645 \u06a9\u0648 \u0631\u0648\u06a9\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0627\u06cc\u0688\u0645 \u0688\u0628\u0644\u06cc\u0648 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 \u06a9\u0648 \u0645\u0648\u0645\u06cc\u0646\u0679\u0645 + \u0688\u06d2 \u2192 \u0627\u0635\u0644 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0637\u0648\u0631 \u067e\u0631 \u0627\u067e \u0688\u06cc\u0679 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p>5000 \u0628\u0627\u0631 \u062a\u06a9 \u062f\u06c1\u0631\u0627\u0626\u06cc\u06ba\u060c 
\u0628\u06c1\u062a\u0631\u06cc\u0646 \u0686\u06cc\u06a9 \u067e\u0648\u0627\u0626\u0646\u0679 \u0645\u062d\u0641\u0648\u0638 \u06a9\u0631\u06cc\u06ba \u2192 \u0645\u06a9\u0645\u0644<\/p>\n<\/li>\n<\/ol>\n<h3 id=\"heading-key-metrics\">\u0627\u06c1\u0645 \u0627\u0634\u0627\u0631\u06d2<\/h3>\n<p><strong>\u06a9\u0631\u0627\u0633 \u0627\u06cc\u0646\u0679\u0631\u0648\u067e\u06cc \u0646\u0642\u0635\u0627\u0646<\/strong> \u06cc\u06c1 \u067e\u06cc\u0645\u0627\u0626\u0634 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u067e\u06cc\u0634 \u06af\u0648\u0626\u06cc \u0634\u062f\u06c1 \u0627\u0645\u06a9\u0627\u0646\u06cc \u062a\u0642\u0633\u06cc\u0645 \u0627\u0635\u0644 \u0627\u06af\u0644\u06d2 \u0679\u0648\u06a9\u0646 \u0633\u06d2 \u06a9\u062a\u0646\u06cc \u062f\u0648\u0631 \u06c1\u06d2\u06d4 32,000 \u0679\u0648\u06a9\u0646\u0632 \u06a9\u06d2 vocab \u067e\u0631 \u0627\u06cc\u06a9 \u0628\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u0645\u0627\u0688\u0644 \u06a9\u0627 \u0646\u0642\u0635\u0627\u0646 \u2248 ln(32000) \u2248 10.4 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p><strong>\u067e\u0631\u067e\u0644\u06cc\u06a9\u0633\u0679\u06cc (PPL) = e<sup>loss<\/sup><\/strong> \u0627\u0633 \u06a9\u06cc \u062a\u0634\u0631\u06cc\u062d \u0627\u0633 \u0637\u0631\u062d \u06a9\u06cc \u062c\u0627 \u0633\u06a9\u062a\u06cc \u06c1\u06d2 \u06a9\u06c1 \"\u0645\u0627\u0688\u0644 N \u06cc\u06a9\u0633\u0627\u06ba \u0645\u0645\u06a9\u0646\u06c1 \u0679\u0648\u06a9\u0646\u0632 \u0645\u06cc\u06ba \u0633\u06d2 \u0645\u0646\u062a\u062e\u0628 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\"<\/p>\n<ul>\n<li>\n<p>PPL 32,000 = \u0628\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u0627\u0646\u062f\u0627\u0632\u06c1<\/p>\n<\/li>\n<li>\n<p>PPL 100 = ~100 \u0627\u0645\u06cc\u062f\u0648\u0627\u0631\u0648\u06ba \u062a\u06a9 \u0645\u062d\u062f\u0648\u062f<\/p>\n<\/li>\n<li>\n<p>PPL 10 = \u0628\u06c1\u062a \u067e\u0631\u0627\u0639\u062a\u0645\u0627\u062f \u067e\u06cc\u0634\u06cc\u0646 
\u06af\u0648\u0626\u06cc<\/p>\n<\/li>\n<\/ul>\n<p>\u0627\u06cc\u06a9 \u0628\u0627\u0631 \u062c\u0628 \u0679\u0631\u06cc\u0646\u0646\u06af \u0645\u06a9\u0645\u0644 \u06c1\u0648 \u062c\u0627\u0626\u06d2 \u0627\u0648\u0631 \u0645\u0627\u0688\u0644 \u0622\u067e \u06a9\u06cc \u0688\u0631\u0627\u0626\u06cc\u0648 \u0645\u06cc\u06ba \u0645\u062d\u0641\u0648\u0638 \u06c1\u0648 \u062c\u0627\u0626\u06d2\u060c \u0627\u06af\u0644\u0627 \u0645\u0631\u062d\u0644\u06c1 \u06cc\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0627\u067e\u0646\u06d2 \u0644\u0648\u06a9\u0644 \u0633\u0633\u0679\u0645 \u0645\u06cc\u06ba \u0688\u0627\u0624\u0646 \u0644\u0648\u0688 \u06a9\u0631\u06cc\u06ba \u062a\u0627\u06a9\u06c1 \u062f\u0631\u062c \u0630\u06cc\u0644 \u0627\u0642\u062f\u0627\u0645\u0627\u062a \u06a9\u06cc\u06d2 \u062c\u0627 \u0633\u06a9\u06cc\u06ba:<\/p>\n<p>\u0627\u0628 \u0622\u067e \u06a9\u06d2 \u067e\u0627\u0633 \u0627\u06cc\u06a9 \u062a\u06cc\u0627\u0631 \u0645\u0627\u0688\u0644 \u06c1\u06d2\u060c \u0644\u06cc\u06a9\u0646 \u0627\u06cc\u06a9 \u0633\u0648\u0627\u0644 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u062a\u0627 \u06c1\u06d2. 
\u06a9\u06cc\u0627 \u0622\u067e \u0627\u067e\u0646\u06d2 \u0645\u0627\u0688\u0644 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0686\u06cc\u0679 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u062a\u06cc\u0627\u0631 \u06c1\u06cc\u06ba \u062c\u06cc\u0633\u0627 \u06a9\u06c1 \u0622\u067e AI \u0679\u0648\u0644 \u062c\u06cc\u0633\u06d2 ChatGPT\u060c Claude\u060c \u06cc\u0627 Copilot \u06a9\u06d2 \u0633\u0627\u062a\u06be \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u061f \u062c\u0648\u0627\u0628 \u06c1\u06d2 <strong>\u0646\u06c1\u06cc\u06ba<\/strong>\u060c \u0627\u0628\u06be\u06cc \u0646\u06c1\u06cc\u06ba\u06d4 \u06a9\u06cc\u0648\u06ba\u061f<\/p>\n<p>\u062a\u0631\u0628\u06cc\u062a \u06a9\u0627 \u062d\u0635\u06c1 \u0645\u06a9\u0645\u0644 \u06c1\u0648 \u0686\u06a9\u0627 \u06c1\u06d2\u060c \u0644\u06cc\u06a9\u0646 \u0645\u0627\u0688\u0644 \u0646\u06c1\u06cc\u06ba \u062c\u0627\u0646\u062a\u0627 \u06a9\u06c1 \u062c\u0648\u0627\u0628 \u06a9\u06cc \u0633\u0627\u062e\u062a \u06a9\u06cc\u0633\u06d2 \u0628\u0646\u0627\u0626\u06cc \u062c\u0627\u0626\u06d2 \u06cc\u0627 \u06af\u0641\u062a\u06af\u0648 \u06a9\u06d2 \u0627\u0646\u062f\u0627\u0632 \u0645\u06cc\u06ba \u06a9\u06cc\u0633\u06d2 \u0644\u06a9\u06be\u0627 \u062c\u0627\u0626\u06d2\u060c \u062c\u06cc\u0633\u06d2 \u0635\u0627\u0631\u0641 \u06a9\u06d2 \u0633\u0648\u0627\u0644\u0627\u062a \u06a9\u0627 \u062c\u0648\u0627\u0628 \u062f\u06cc\u0646\u0627\u06d4 \u06cc\u06c1 \u0648\u06c1 \u0642\u062f\u0645 \u06c1\u06d2 \u062c\u0633\u06d2 \u06c1\u0645 <strong>\u0632\u06cc\u0631 \u0646\u06af\u0631\u0627\u0646\u06cc \u0641\u0627\u0626\u0646 \u0679\u06cc\u0648\u0646\u0646\u06af (SFT)<\/strong> \u06a9\u06c1\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<h2 id=\"heading-4-supervised-fine-tuning-sft\">4. 
\u0632\u06cc\u0631 \u0646\u06af\u0631\u0627\u0646\u06cc \u0641\u0627\u0626\u0646 \u0679\u06cc\u0648\u0646\u0646\u06af (SFT)<\/h2>\n<p>\u0628\u06c1\u062a \u0627\u0639\u0644\u06cc\u0670 \u0633\u0637\u062d \u067e\u0631\u060c SFT \u0627\u06cc\u06a9 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0633\u06a9\u06be\u0627\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u0633\u0648\u0627\u0644\u0627\u062a \u06a9\u0627 \u062c\u0648\u0627\u0628 \u06a9\u06cc\u0633\u06d2 \u062f\u06cc\u0627 \u062c\u0627\u0626\u06d2\u06d4 \u06cc\u06c1 \u0627\u06cc\u06a9 \u0645\u062b\u0627\u0644 \u062f\u06cc\u0646\u06d2 \u06a9\u06cc \u0637\u0631\u062d \u06c1\u06d2 \u062c\u0633 \u0633\u06d2 \u0622\u067e \u0633\u06cc\u06a9\u06be \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba \u06a9\u06c1 \u06a9\u0633 \u0637\u0631\u062d \u062c\u0648\u0627\u0628 \u062f\u06cc\u0646\u0627 \u06c1\u06d2\u06d4 \u06c1\u0645\u0627\u0631\u06d2 \u067e\u0627\u0633 \u062c\u062a\u0646\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u06a9\u06cc\u0633\u0632 \u06c1\u0648\u06ba \u06af\u06d2\u060c \u062c\u0648\u0627\u0628 \u0627\u062a\u0646\u0627 \u06c1\u06cc \u0628\u06c1\u062a\u0631 \u06c1\u0648\u06af\u0627\u06d4 \u0644\u06c1\u0630\u0627 \u062c\u0648\u06c1\u0631 \u0645\u06cc\u06ba\u060c \u0632\u06cc\u0631 \u0646\u06af\u0631\u0627\u0646\u06cc \u0641\u0627\u0626\u0646 \u0679\u06cc\u0648\u0646\u0646\u06af \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0627\u06cc\u06a9 \u0627\u0646\u0679\u0631\u0627\u06cc\u06a9\u0679\u0648 \u0627\u06cc\u062c\u0646\u0679 \u0645\u06cc\u06ba \u0628\u062f\u0644 \u062f\u06cc\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<p>\u0627\u0633 \u06a9\u0648 \u062d\u0627\u0635\u0644 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2\u060c \u06c1\u0645 \u062f\u0631\u062c \u0630\u06cc\u0644 \u06a9\u0644\u06cc\u062f\u06cc \u062c\u0648\u0691\u06d2 \u0627\u0648\u0631 \u0641\u0627\u0631\u0645\u06cc\u0679 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0627\u06cc\u06a9 \u0645\u062b\u0627\u0644 
\u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u0628\u0646\u0627\u062a\u06d2 \u06c1\u06cc\u06ba:<\/p>\n<pre><code class=\"language-json\">{\n  \"conversations\": [\n    {\"role\": \"system\", \"content\": \"\u0622\u067e \u0627\u06cc\u06a9 \u0645\u062f\u062f\u06af\u0627\u0631 \u0627\u0631\u062f\u0648 \u0627\u0633\u0633\u0679\u0646\u0679 \u06c1\u06cc\u06ba\u06d4\"},\n    {\"role\": \"user\", \"content\": \"\u0633\u0648\u0627\u0644...\"},\n    {\"role\": \"assistant\", \"content\": \"\u062c\u0648\u0627\u0628...\"}\n  ]\n}\n<\/code><\/pre>\n<p>\u062a\u0642\u0631\u06cc\u0628\u0627\u064b <strong>79 \u0645\u062b\u0627\u0644\u06cc\u06ba\u06d4<\/strong> \u0627\u0633\u06d2 \u0633\u0633\u0679\u0645 \u0645\u06cc\u06ba \u0641\u06cc\u0688 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 JSONL \u0641\u0627\u0631\u0645\u06cc\u0679 \u0645\u06cc\u06ba \u0627\u0633\u0679\u0648\u0631 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 \u06c1\u0645 \u062d\u0642\u06cc\u0642\u06cc \u062f\u0646\u06cc\u0627 \u06a9\u06d2 \u0645\u0639\u0627\u0645\u0644\u0627\u062a \u0633\u06d2 \u0645\u0632\u06cc\u062f \u0645\u062b\u0627\u0644\u06cc\u06ba \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u06cc\u06ba \u06af\u06d2\u06d4 \u062c\u06cc\u0633\u0627 \u06a9\u06c1 \u067e\u06c1\u0644\u06d2 \u06c1\u06cc \u0630\u06a9\u0631 \u06a9\u06cc\u0627 \u06af\u06cc\u0627 \u06c1\u06d2\u060c \u0622\u067e \u06a9\u06d2 \u067e\u0627\u0633 \u062c\u062a\u0646\u06cc \u0632\u06cc\u0627\u062f\u06c1 \u0645\u062b\u0627\u0644\u06cc\u06ba \u06c1\u0648\u06ba \u06af\u06cc\u060c \u0622\u067e \u06a9\u0648 \u0627\u062a\u0646\u06d2 \u06c1\u06cc \u0628\u06c1\u062a\u0631 \u0646\u062a\u0627\u0626\u062c \u0645\u0644\u06cc\u06ba \u06af\u06d2\u06d4<\/p>\n<h3 id=\"heading-formatting-conversations-for-training\">\u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0644\u06cc\u06d2 \u06af\u0641\u062a\u06af\u0648 \u06a9\u0648 \u0641\u0627\u0631\u0645\u06cc\u0679 
\u06a9\u0631\u0646\u0627<\/h3>\n<p>\u0627\u06af\u0644\u0627 \u0645\u0631\u062d\u0644\u06c1 \u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0648\u067e\u0631 \u0645\u062d\u0641\u0648\u0638 \u06a9\u06cc \u06af\u0626\u06cc \u06af\u0641\u062a\u06af\u0648 \u06a9\u0648 \u0641\u0627\u0631\u0645\u06cc\u0679 \u06a9\u0631\u0646\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1 SFT \u06af\u0641\u062a\u06af\u0648 \u06a9\u06cc \u0641\u0627\u0631\u0645\u06cc\u0679\u0646\u06af \u06a9\u0627 \u0645\u0631\u062d\u0644\u06c1 \u06c1\u06d2\u06d4 \u062e\u0627\u0645 \u06af\u0641\u062a\u06af\u0648 JSON \u06a9\u0648 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0679\u0648\u06a9\u0646 ID \u062a\u0631\u062a\u06cc\u0628 \u0645\u06cc\u06ba \u062a\u0628\u062f\u06cc\u0644 \u06a9\u0631\u06cc\u06ba: <strong>\u0646\u0642\u0635\u0627\u0646 \u0645\u0627\u0633\u06a9\u0646\u06af<\/strong>\u0644\u06c1\u0630\u0627\u060c \u0645\u0627\u0688\u0644 \u0635\u0631\u0641 \u06cc\u06c1 \u0633\u06cc\u06a9\u06be\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u06a9\u0633 \u0637\u0631\u062d \u0645\u0639\u0627\u0648\u0646 \u0631\u062f\u0639\u0645\u0644 \u067e\u06cc\u062f\u0627 \u06a9\u0631\u0646\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u0646\u0642\u0635\u0627\u0646 \u06a9\u06cc \u0645\u0627\u0633\u06a9\u0646\u06af \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u062c\u0627\u0646 \u0628\u0648\u062c\u06be \u06a9\u0631 \u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0646\u0642\u0635\u0627\u0646 \u0633\u06d2 \u0627\u0646 \u067e\u0679 \u06a9\u06d2 \u06a9\u0686\u06be \u062d\u0635\u0648\u06ba \u06a9\u0648 \u0686\u06be\u067e\u0627\u0646\u0627\u06d4 \u0627\u0633 \u0635\u0648\u0631\u062a \u0645\u06cc\u06ba\u060c \u06c1\u0645 \u0633\u0633\u0679\u0645 \u067e\u0631\u0627\u0645\u067e\u0679\u0633 \u0627\u0648\u0631 \u0635\u0627\u0631\u0641 \u06a9\u06d2 \u067e\u06cc\u063a\u0627\u0645\u0627\u062a \u06a9\u0648 \u0645\u0627\u0633\u06a9 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba 
\u062a\u0627\u06a9\u06c1 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0627\u0646\u06c1\u06cc\u06ba \u06cc\u0627\u062f \u0631\u06a9\u06be\u0646\u06d2 \u06cc\u0627 \u062f\u0648\u0628\u0627\u0631\u06c1 \u0628\u0646\u0627\u0646\u06d2 \u06a9\u06cc \u062a\u0631\u0628\u06cc\u062a \u0646\u06c1 \u062f\u06cc \u062c\u0627\u0626\u06d2\u06d4 \u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0627\u0634\u0627\u0631\u06d2 \u0635\u0631\u0641 \u0627\u0633\u0633\u0679\u0646\u0679 \u06a9\u06d2 \u062c\u0648\u0627\u0628\u0627\u062a \u0633\u06d2 \u0622\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u062c\u0648 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0633\u06a9\u06be\u0627\u0646\u06d2 \u06a9\u0627 \u0627\u06cc\u06a9 \u0645\u0641\u06cc\u062f \u062d\u0635\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u06a9\u06cc\u0627 \u067e\u06cc\u062f\u0627 \u06a9\u0631\u0646\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u06a9\u0628 \u0631\u0648\u06a9\u0646\u0627 \u06c1\u06d2\u06d4<\/p>\n<h4 id=\"heading-part-1-disable-auto-formatting-amp-get-special-token-ids\">\u062d\u0635\u06c1 1: \u0622\u0679\u0648 \u0641\u0627\u0631\u0645\u06cc\u0679 \u06a9\u0648 \u063a\u06cc\u0631 \u0641\u0639\u0627\u0644 \u06a9\u0631\u06cc\u06ba \u0627\u0648\u0631 \u0627\u06cc\u06a9 \u062e\u0635\u0648\u0635\u06cc \u0679\u0648\u06a9\u0646 ID \u062d\u0627\u0635\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/h4>\n<pre><code class=\"language-python\">tokenizer.no_padding()\n\nBOS_ID = tokenizer.token_to_id(\"&lt;bos&gt;\")       # 2\nEOS_ID = tokenizer.token_to_id(\"&lt;eos&gt;\")       # 3\nSEP_ID = tokenizer.token_to_id(\"&lt;sep&gt;\")       # 4\nPAD_ID = tokenizer.token_to_id(\"&lt;pad&gt;\")       # 0\nUSER_ID = tokenizer.token_to_id(\"&lt;|user|&gt;\")          # 5\nASSISTANT_ID = tokenizer.token_to_id(\"&lt;|assistant|&gt;\") # 6\nSYSTEM_ID = tokenizer.token_to_id(\"&lt;|system|&gt;\")       # 7\n\nIGNORE_INDEX = -100\n<\/code><\/pre>\n<ul>\n<li>\n<p><code>no_padding()<\/code><strong>:<\/strong> \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u0633\u06d2 \u06a9\u06c1\u0648\u060c 
\"\u067e\u06cc\u0688\u0646\u06af \u062e\u0648\u062f\u06a9\u0627\u0631 \u0637\u0648\u0631 \u067e\u0631 \u0634\u0627\u0645\u0644 \u0646\u06c1 \u06a9\u0631\u06cc\u06ba\u06d4 \u0645\u06cc\u06ba \u062e\u0648\u062f \u06a9\u0631\u0648\u06ba \u06af\u0627\u06d4\" \u0679\u0648\u06a9\u0646 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628 \u067e\u0631 \u0645\u06a9\u0645\u0644 \u06a9\u0646\u0679\u0631\u0648\u0644 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0622\u067e \u06c1\u0631 \u062e\u0635\u0648\u0635\u06cc \u0679\u0648\u06a9\u0646 \u06a9\u06cc \u0639\u062f\u062f\u06cc ID \u062d\u0627\u0635\u0644 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba \u0627\u0648\u0631 \u0627\u0633\u06d2 \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0635\u062d\u06cc\u062d \u062c\u06af\u06c1 \u067e\u0631 \u062f\u0627\u062e\u0644 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><code>IGNORE_INDEX = -100<\/code><strong>:<\/strong> \u067e\u0627\u0626\u06cc \u0679\u0627\u0631\u0686 <code>cross_entropy<\/code> \u0627\u0633 \u0645\u06cc\u06ba \u0628\u0644\u0679 \u0627\u0646 \u062e\u0635\u0648\u0635\u06cc\u0627\u062a \u06c1\u06cc\u06ba\u06d4 -100 \u067e\u0631 \u0633\u06cc\u0679 \u06a9\u06cc\u06d2 \u06af\u0626\u06d2 \u06a9\u0633\u06cc \u0628\u06be\u06cc \u0644\u06cc\u0628\u0644 \u06a9\u0648 \u0646\u0642\u0635\u0627\u0646 \u06a9\u06d2 \u062d\u0633\u0627\u0628 \u06a9\u062a\u0627\u0628 \u0645\u06cc\u06ba \u0686\u06be\u0648\u0691 \u062f\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 \u0627\u0633 \u0637\u0631\u062d \u0646\u0642\u0635\u0627\u0646 \u062f\u06c1 \u0645\u0627\u0633\u06a9\u0646\u06af \u06a9\u0648 \u0644\u0627\u06af\u0648 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<\/ul>\n<h4 id=\"heading-part-2-formatconversation-the-core-function\">\u062d\u0635\u06c1 2: <code>format_conversation()<\/code>: \u0628\u0646\u06cc\u0627\u062f\u06cc 
\u062e\u0635\u0648\u0635\u06cc\u0627\u062a<\/h4>\n<p>\u0627\u0633 \u0633\u06d2 \u0645\u06a9\u0627\u0644\u0645\u06d2 \u06a9\u06d2 \u0630\u0631\u06cc\u0639\u06d2 \u062f\u0648 \u0645\u062a\u0648\u0627\u0632\u06cc \u0627\u0646\u062a\u0638\u0627\u0645\u0627\u062a \u067e\u06cc\u062f\u0627 \u06c1\u0648\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<pre><code class=\"language-plaintext\">input_ids: [BOS, SYSTEM, \u0622\u067e, \u0627\u06cc\u06a9, \u0645\u062f\u062f\u06af\u0627\u0631, ..., SEP, USER, \u067e\u0627\u06a9\u0633\u062a\u0627\u0646, \u06a9\u0627, ..., SEP, ASST, \u0627\u0633\u0644\u0627\u0645, \u0622\u0628\u0627\u062f, \u06c1\u06d2, EOS, PAD, PAD, ...]\nlabels:    [-100, -100, -100, -100, -100, ..., -100, -100, -100,    -100,..., -100, -100, \u0627\u0633\u0644\u0627\u0645, \u0622\u0628\u0627\u062f, \u06c1\u06d2, EOS, -100, -100, ...]\n<\/code><\/pre>\n<p><strong>\u0641\u0646\u06a9\u0634\u0646 \u06a9\u06d2 \u0627\u0646\u062f\u0631 \u0645\u0631\u062d\u0644\u06c1 \u0648\u0627\u0631 \u0648\u0636\u0627\u062d\u062a:<\/strong><\/p>\n<p>1. BOS \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0634\u0631\u0648\u0639 \u06a9\u0631\u06cc\u06ba:<\/p>\n<pre><code class=\"language-python\">input_ids = [BOS_ID]\nlabels = [IGNORE_INDEX]    # Don't learn to predict BOS\n<\/code><\/pre>\n<p>2. 
\u06c1\u0631 \u0645\u0648\u0691 \u067e\u0631 \u0645\u0648\u0627\u062f \u06a9\u0648 \u0627\u0646\u06a9\u0648\u0688 \u06a9\u0631\u06cc\u06ba \u0627\u0648\u0631 \u062e\u0648\u062f \u0628\u062e\u0648\u062f \u0634\u0627\u0645\u0644 \u06a9\u0631\u062f\u06c1 BOS\/EOS \u06a9\u0648 \u06c1\u0679\u0627 \u062f\u06cc\u06ba\u06d4<\/p>\n<pre><code class=\"language-python\">content_ids = tokenizer.encode(content).ids\nif content_ids[0] == BOS_ID: content_ids = content_ids[1:]     # Remove if tokenizer auto-added\nif content_ids[-1] == EOS_ID: content_ids = content_ids[:-1]\n<\/code><\/pre>\n<p>\u0686\u0648\u0646\u06a9\u06c1 \u06c1\u0645 \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 \u062e\u0627\u0635 \u0679\u0648\u06a9\u0646\u0632 \u06a9\u0648 \u0635\u062d\u06cc\u062d \u062c\u06af\u06c1 \u067e\u0631 \u0631\u06a9\u06be\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u06c1\u0645\u06cc\u06ba \u0688\u067e\u0644\u06cc\u06a9\u06cc\u0679\u0633 \u0646\u06c1\u06cc\u06ba \u0686\u0627\u06c1\u06cc\u06d2\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u06c1\u0645 \u0627\u0646\u06c1\u06cc\u06ba \u06c1\u0679\u0627 \u062f\u06cc\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p>3. 
\u06c1\u0631 \u06a9\u0631\u062f\u0627\u0631 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0679\u0648\u06a9\u0646 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628 \u0628\u0646\u0627\u0626\u06cc\u06ba\u06d4<\/p>\n<table>\n<thead>\n<tr>\n<th>\u06a9\u0631\u062f\u0627\u0631<\/th>\n<th>\u0679\u0648\u06a9\u0646 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628<\/th>\n<th>\u0644\u06cc\u0628\u0644<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>\u0633\u0633\u0679\u0645<\/td>\n<td><code>[SYSTEM_ID] + content + [SEP_ID]<\/code><\/td>\n<td>\u062a\u0645\u0627\u0645 -100 (\u0646\u0642\u0627\u0628 \u067e\u0648\u0634)<\/td>\n<\/tr>\n<tr>\n<td>\u0635\u0627\u0631\u0641<\/td>\n<td><code>[USER_ID] + content + [SEP_ID]<\/code><\/td>\n<td>\u062a\u0645\u0627\u0645 -100 (\u0646\u0642\u0627\u0628 \u067e\u0648\u0634)<\/td>\n<\/tr>\n<tr>\n<td>\u0627\u0633\u0633\u0679\u0646\u0679<\/td>\n<td><code>[ASSISTANT_ID] + content + [EOS_ID]<\/code><\/td>\n<td><code>[-100] + content + [EOS_ID]<\/code><\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u0627\u0633\u0633\u0679\u0646\u0679 \u06a9\u0627 \u0631\u0648\u0644 \u0679\u0648\u06a9\u0646 (<code>&lt;|assistant|&gt;<\/code>) \u062e\u0648\u062f \u0646\u0642\u0627\u0628 \u067e\u0648\u0634 \u06c1\u06d2 \u06a9\u06cc\u0648\u0646\u06a9\u06c1 \u06c1\u0645 \u0646\u06c1\u06cc\u06ba \u0686\u0627\u06c1\u062a\u06d2 \u06a9\u06c1 \u0645\u0627\u0688\u0644 \u0627\u0633 \u0679\u0648\u06a9\u0646 \u06a9\u06cc \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648\u0626\u06cc \u06a9\u0631\u0646\u0627 \u0633\u06cc\u06a9\u06be\u06d2\u06d4 \u062a\u0627\u06c1\u0645\u060c \u0627\u0635\u0644 \u062c\u0648\u0627\u0628\u06cc \u0645\u0648\u0627\u062f \u0627\u0648\u0631 <code>&lt;eos&gt;<\/code> \u06a9\u06d2 \u0644\u06cc\u0628\u0644 \u0645\u0648\u062c\u0648\u062f \u06c1\u06cc\u06ba\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u0645\u0627\u0688\u0644 \u0633\u06cc\u06a9\u06be\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u06a9\u06cc\u0627 \u062c\u0648\u0627\u0628 \u062f\u06cc\u0646\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u06a9\u0628 \u0631\u06a9\u0646\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>4. 
\u06a9\u0627\u0679 \u06a9\u0631 \u0628\u06be\u0631\u06cc\u06ba\u06d4<\/p>\n<pre><code class=\"language-python\">input_ids = input_ids[:max_len]          # Cut to 256 tokens max\nlabels = labels[:max_len]\npad_len = max_len - len(input_ids)\ninput_ids = input_ids + [PAD_ID] * pad_len\nlabels = labels + [IGNORE_INDEX] * pad_len   # Don't learn from padding either\n<\/code><\/pre>\n<p>\u0628\u06cc\u0686 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2\u060c \u062a\u0645\u0627\u0645 \u062a\u0631\u062a\u06cc\u0628\u0648\u06ba \u06a9\u06cc \u0644\u0645\u0628\u0627\u0626\u06cc \u0627\u06cc\u06a9 \u06c1\u06cc \u06c1\u0648\u0646\u06cc \u0686\u0627\u06c1\u06cc\u06d2\u06d4 \u067e\u06cc\u0688\u0646\u06af \u0644\u06cc\u0628\u0644 -100 \u06c1\u06d2\u060c \u0644\u06c1\u0630\u0627 \u0646\u0642\u0635\u0627\u0646 \u06a9\u06d2 \u062d\u0633\u0627\u0628 \u0645\u06cc\u06ba \u0627\u0633\u06d2 \u0646\u0638\u0631 \u0627\u0646\u062f\u0627\u0632 \u06a9\u0631 \u062f\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u06cc\u06c1\u0627\u06ba \u0645\u06a9\u0645\u0644 <code>format_conversation()<\/code> \u0641\u0646\u06a9\u0634\u0646 \u06c1\u06d2:<\/p>\n<pre><code class=\"language-python\">def format_conversation(conversation: dict, max_len: int = 256) -> dict:\n    \"\"\"\n    Convert a conversation dict into token IDs + labels for SFT.\n\n    Format: &lt;bos&gt;&lt;|system|&gt;...&lt;sep&gt;&lt;|user|&gt;...&lt;sep&gt;&lt;|assistant|&gt;...&lt;eos&gt;\n    Labels: -100 for system\/user tokens (masked), actual IDs for assistant tokens.\n    \"\"\"\n    input_ids = [BOS_ID]\n    labels = [IGNORE_INDEX]\n\n    for turn in conversation[\"conversations\"]:\n        role = turn[\"role\"]\n        content = turn[\"content\"]\n\n        content_ids = tokenizer.encode(content).ids\n        if content_ids and content_ids[0] == BOS_ID:\n            content_ids = content_ids[1:]\n        if content_ids and content_ids[-1] == EOS_ID:\n            content_ids = content_ids[:-1]\n\n        if role == \"system\":\n     
       role_ids = [SYSTEM_ID] + content_ids + [SEP_ID]\n            role_labels = [IGNORE_INDEX] * len(role_ids)\n        elif role == \"user\":\n            role_ids = [USER_ID] + content_ids + [SEP_ID]\n            role_labels = [IGNORE_INDEX] * len(role_ids)\n        elif role == \"assistant\":\n            role_ids = [ASSISTANT_ID] + content_ids + [EOS_ID]\n            role_labels = [IGNORE_INDEX] + content_ids + [EOS_ID]\n\n        input_ids.extend(role_ids)\n        labels.extend(role_labels)\n\n    # Truncate and pad to max_len\n    input_ids = input_ids[:max_len]\n    labels = labels[:max_len]\n    pad_len = max_len - len(input_ids)\n    input_ids = input_ids + [PAD_ID] * pad_len\n    labels = labels + [IGNORE_INDEX] * pad_len\n\n    return {\"input_ids\": input_ids, \"labels\": labels}\n<\/code><\/pre>\n<h4 id=\"heading-part-3-verification\">\u062d\u0635\u06c1 3: \u062a\u0635\u062f\u06cc\u0642<\/h4>\n<pre><code class=\"language-python\">n_loss_tokens = sum(1 for l in test_formatted['labels'] if l != IGNORE_INDEX)\nprint(f\"  Tokens with loss: {n_loss_tokens} \/ 256\")\n<\/code><\/pre>\n<p>\u06cc\u06c1 \u0627\u0633 \u0628\u0627\u062a \u06a9\u06cc \u062a\u0635\u062f\u06cc\u0642 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u06a9\u06c1 \u0679\u0648\u06a9\u0646 \u06a9\u0627 \u0635\u0631\u0641 \u0627\u06cc\u06a9 \u0686\u06be\u0648\u0679\u0627 \u0633\u0627 \u062d\u0635\u06c1 (\u0627\u0633\u0633\u0679\u0646\u0679 \u06a9\u0627 \u062c\u0648\u0627\u0628 + EOS) \u0646\u0642\u0635\u0627\u0646 \u0645\u06cc\u06ba \u062d\u0635\u06c1 \u0688\u0627\u0644\u062a\u0627 \u06c1\u06d2\u06d4 \u0627\u06cc\u06a9 \u0639\u0627\u0645 \u0645\u062b\u0627\u0644 \u06a9\u06d2 \u0637\u0648\u0631 \u067e\u0631\u060c \u0622\u067e \u06a9\u0648 \u06a9\u0686\u06be \u0627\u0633 \u0637\u0631\u062d \u0646\u0638\u0631 \u0622 \u0633\u06a9\u062a\u0627 \u06c1\u06d2: <code>Tokens with loss: 18 \/ 256<\/code>\u06d4 \u0627\u0633 \u06a9\u0627 \u0645\u0637\u0644\u0628 \u06c1\u06d2 \u06a9\u06c1 
\u0633\u06cc\u06a9\u0648\u06cc\u0646\u0633 \u06a9\u06d2 \u0635\u0631\u0641 ~7% \u0679\u0648\u06a9\u0646 \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679 \u0627\u067e \u0688\u06cc\u0679\u0633 \u0686\u0644\u0627\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4 \u0628\u0627\u0642\u06cc (\u0633\u0633\u0679\u0645 \u067e\u0631\u0627\u0645\u067e\u0679\u0633\u060c \u0635\u0627\u0631\u0641 \u06a9\u06d2 \u0633\u0648\u0627\u0644\u0627\u062a\u060c \u062e\u0635\u0648\u0635\u06cc \u0679\u0648\u06a9\u0646\u060c \u067e\u06cc\u0688\u0646\u06af) <code>-100<\/code> \u06a9\u06d2 \u0630\u0631\u06cc\u0639\u06d2 \u0646\u0642\u0627\u0628 \u067e\u0648\u0634 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p>\u06cc\u06c1 SFT \u06a9\u0648 \u0628\u06c1\u062a \u0645\u0648\u062b\u0631 \u0628\u0646\u0627\u062a\u0627 \u06c1\u06d2\u06d4 100% \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06d2 \u0633\u06af\u0646\u0644 \u0627\u0633\u0633\u0679\u0646\u0679 \u06a9\u06d2 \u0627\u0635\u0644 \u062c\u0648\u0627\u0628 \u06a9\u06cc \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648\u0626\u06cc \u06a9\u0631\u0646\u06d2 \u0627\u0648\u0631 \u06cc\u06c1 \u062c\u0627\u0646\u0646\u06d2 \u0633\u06d2 \u0622\u062a\u06d2 \u06c1\u06cc\u06ba \u06a9\u06c1 \u06a9\u0628 \u0631\u06a9\u0646\u0627 \u06c1\u06d2 (<code>&lt;eos&gt;<\/code>)\u06d4 \u06cc\u06c1 \u06a9\u0627\u0631\u06a9\u0631\u062f\u06af\u06cc \u062e\u0627\u0635 \u0637\u0648\u0631 \u067e\u0631 \u0627\u06c1\u0645 \u06c1\u06d2 \u062c\u0628 \u0635\u0631\u0641 79 \u062a\u0631\u0628\u06cc\u062a\u06cc \u0645\u062b\u0627\u0644\u06cc\u06ba \u06c1\u0648\u06ba\u06d4<\/p>\n<h3 id=\"heading-formatting-summary\">\u0641\u0627\u0631\u0645\u06cc\u0679\u0646\u06af \u06a9\u0627 \u062e\u0644\u0627\u0635\u06c1<\/h3>\n<table>\n<thead>\n<tr>\n<th>\u0639\u0646\u0635\u0631<\/th>\n<th>\u0645\u0642\u0635\u062f<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td><code>no_padding()<\/code><\/td>\n<td>\u0679\u0648\u06a9\u0646 \u067e\u0644\u06cc\u0633\u0645\u0646\u0679 \u06a9\u0648 \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 
\u06a9\u0646\u0679\u0631\u0648\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/td>\n<\/tr>\n<tr>\n<td>\u062e\u0635\u0648\u0635\u06cc \u0679\u0648\u06a9\u0646 ID<\/td>\n<td>\u0639\u06cc\u0646 \u0645\u0637\u0627\u0628\u0642 \u0645\u0642\u0627\u0645\u0627\u062a \u067e\u0631 \u0686\u06cc\u0679 \u0688\u06be\u0627\u0646\u0686\u06c1 \u0645\u0627\u0631\u06a9\u0631 \u062f\u0627\u062e\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/td>\n<\/tr>\n<tr>\n<td><code>IGNORE_INDEX = -100<\/code><\/td>\n<td>\u0646\u0642\u0627\u0628 \u067e\u0648\u0634 \u067e\u0648\u0632\u06cc\u0634\u0646\u0648\u06ba \u06a9\u0648 \u0686\u06be\u0648\u0691\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 PyTorch \u06a9\u0627 \u0628\u0644\u0679 \u0627\u0646 \u0645\u06cc\u06a9\u0627\u0646\u0632\u0645<\/td>\n<\/tr>\n<tr>\n<td>\u0633\u0633\u0679\u0645\/\u06cc\u0648\u0632\u0631 \u0644\u06cc\u0628\u0644 \u2192 -100<\/td>\n<td>\u0627\u0633 \u0633\u06d2 \u0645\u062a \u0633\u06cc\u06a9\u06be\u0648 (\u0635\u0631\u0641 \u0633\u06cc\u0627\u0642 \u0648 \u0633\u0628\u0627\u0642)<\/td>\n<\/tr>\n<tr>\n<td>\u0627\u0633\u0633\u0679\u0646\u0679 \u0644\u06cc\u0628\u0644 \u2192 \u0627\u0635\u0644\u06cc ID<\/td>\n<td>\u062c\u0627\u0646\u06cc\u06ba \u06a9\u06c1 \u06a9\u06cc\u0633\u06d2 \u062c\u0648\u0627\u0628\u0627\u062a \u067e\u06cc\u062f\u0627 \u06a9\u06cc\u06d2 \u062c\u0627\u0626\u06cc\u06ba + \u06a9\u0628 \u0631\u0648\u06a9\u0646\u0627 \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td>256 \u062a\u06a9 \u06a9\u0627\u0679 \u062f\u06cc\u0627 \u06af\u06cc\u0627\u06d4<\/td>\n<td>\u0645\u0627\u0688\u0644 \u06a9\u06cc \u0633\u06cc\u0627\u0642 \u0648 \u0633\u0628\u0627\u0642 \u06a9\u06cc \u06a9\u06be\u0691\u06a9\u06cc \u0633\u06d2 \u0645\u06cc\u0686 \u06a9\u0631\u06cc\u06ba\u06d4<\/td>\n<\/tr>\n<tr>\n<td>-100 \u0644\u06cc\u0628\u0644 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u067e\u06cc\u0688\u0646\u06af<\/td>\n<td>\u0646\u0642\u0635\u0627\u0646 \u06a9\u0648 \u0622\u0644\u0648\u062f\u06c1 \u06a9\u06cc\u06d2 \u0628\u063a\u06cc\u0631 
\u0628\u06cc\u0686 \u0686\u06be\u0627\u0646\u0679\u0646\u0627<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<h3 id=\"heading-sft-dataset-amp-dataloader\">SFT \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u0627\u0648\u0631 \u0688\u06cc\u0679\u0627 \u0644\u0648\u0688\u0631<\/h3>\n<pre><code class=\"language-python\">class SFTDataset(Dataset):\n    def __init__(self, conversations: list, max_len: int = 256):\n        self.examples = []\n        for conv in conversations:\n            formatted = format_conversation(conv, max_len)\n            self.examples.append(formatted)\n\n    def __len__(self):\n        return len(self.examples)\n\n    def __getitem__(self, idx):\n        return (\n            torch.tensor(self.examples[idx]['input_ids'], dtype=torch.long),\n            torch.tensor(self.examples[idx]['labels'], dtype=torch.long),\n        )\n<\/code><\/pre>\n<p>\u06cc\u06c1 \u062a\u0645\u0627\u0645 79 \u0641\u0627\u0631\u0645\u06cc\u0679 \u0634\u062f\u06c1 \u06af\u0641\u062a\u06af\u0648 \u06a9\u0648 PyTorch \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 \u0645\u06cc\u06ba \u0633\u0645\u06cc\u0679\u062a\u0627 \u06c1\u06d2\u06d4 \u0634\u0631\u0648\u0639 \u06a9\u0631\u0646\u06d2 \u067e\u0631\u060c \u062a\u0645\u0627\u0645 \u0628\u0627\u062a \u0686\u06cc\u062a \u06a9\u0648 \u067e\u06c1\u0644\u06d2 \u0633\u06d2 \u0641\u0627\u0631\u0645\u06cc\u0679 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2 \u0627\u0633 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2: <code>format_conversation()<\/code> \u0627\u0648\u0631 \u0646\u062a\u06cc\u062c\u06c1 \u0645\u062d\u0641\u0648\u0638 \u06a9\u0631\u06cc\u06ba\u06d4 \u062c\u0628 \u0688\u06cc\u0679\u0627 \u0644\u0648\u0688\u0631 \u06a9\u0633\u06cc \u0622\u0626\u0679\u0645 \u06a9\u06cc \u062f\u0631\u062e\u0648\u0627\u0633\u062a \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 <code>idx<\/code>\u06cc\u06c1 \u0648\u0627\u067e\u0633 \u0622\u062a\u0627 \u06c1\u06d2 <code>(input_ids, 
labels)<\/code> \u0679\u06cc\u0646\u0633\u0631 \u06a9\u06d2 \u0633\u0627\u062a\u06be\u06d4<\/p>\n<p><strong>\u0688\u06cc\u0679\u0627 \u0644\u0648\u0688\u0631:<\/strong><\/p>\n<pre><code class=\"language-python\">sft_loader = DataLoader(sft_dataset, batch_size=4, shuffle=True)\n<\/code><\/pre>\n<ul>\n<li>\n<p><code>batch_size=4<\/code><strong>:<\/strong> \u0635\u0631\u0641 79 \u0645\u062b\u0627\u0644\u06cc\u06ba \u06c1\u06cc\u06ba\u060c \u0644\u06c1\u0630\u0627 \u06cc\u06c1 \u0627\u06cc\u06a9 \u0686\u06be\u0648\u0679\u0627 \u0633\u0627 \u0628\u06cc\u0686 \u06c1\u06d2\u06d4 \u0628\u0691\u06d2 \u0628\u06cc\u0686\u0648\u06ba \u06a9\u0648 \u0641\u06cc \u0632\u0645\u0627\u0646\u06c1 \u06a9\u0645 \u06af\u0631\u06cc\u0688\u06cc\u0646\u0679 \u0627\u067e \u0688\u06cc\u0679\u0633 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u0648\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p><code>shuffle=True<\/code><strong>:<\/strong> \u06c1\u0645 \u06c1\u0631 \u062f\u0648\u0631 \u06a9\u06d2 \u0644\u06cc\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u06a9\u0648 \u0628\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u0628\u0646\u0627\u062a\u06d2 \u06c1\u06cc\u06ba \u062a\u0627\u06a9\u06c1 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u0627\u06cc\u06a9 \u0645\u0642\u0631\u0631\u06c1 \u0645\u062b\u0627\u0644 \u06a9\u06cc \u062a\u0631\u062a\u06cc\u0628 \u06cc\u0627\u062f \u0646\u06c1 \u0631\u06c1\u06d2\u06d4<\/p>\n<\/li>\n<\/ul>\n<h3 id=\"heading-loading-the-pre-trained-model\">\u067e\u06c1\u0644\u06d2 \u0633\u06d2 \u062a\u0631\u0628\u06cc\u062a \u06cc\u0627\u0641\u062a\u06c1 \u0645\u0627\u0688\u0644 \u0644\u0648\u0688 \u06a9\u0631\u06cc\u06ba\u06d4<\/h3>\n<pre><code class=\"language-python\">model = UrduGPT(config).to(device)\ncheckpoint = torch.load(\"best_model.pt\", map_location=device)\nstate_dict = checkpoint['model_state_dict']\n\n# Name mapping (Colab \u2192 local)\nname_mapping = {\n    'token_emb.weight': 'token_embedding.weight',\n    'pos_emb.weight': 'position_embedding.weight',\n    
'ln_f.weight': 'ln_final.weight',\n    'ln_f.bias': 'ln_final.bias',\n    'head.weight': 'lm_head.weight',\n}\n<\/code><\/pre>\n<p>\u0627\u0633 \u06a9\u06d2 \u0628\u0639\u062f \u0627\u06cc\u06a9 \u0646\u06cc\u0627 \u0627\u0631\u062f\u0648 \u062c\u06cc \u067e\u06cc \u0679\u06cc \u0645\u0627\u0688\u0644 \u062a\u06cc\u0627\u0631 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u0645\u0631\u062d\u0644\u06c1 3 \u0633\u06d2 \u067e\u06c1\u0644\u06d2 \u0633\u06d2 \u062a\u0631\u0628\u06cc\u062a \u06cc\u0627\u0641\u062a\u06c1 \u0648\u0632\u0646 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0644\u0648\u0688 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u0622\u067e \u0634\u0627\u06cc\u062f \u0633\u0648\u0686 \u0631\u06c1\u06d2 \u06c1\u0648\u06ba \u06af\u06d2 \u06a9\u06c1 \u0622\u067e \u06a9\u0648 \u0646\u0627\u0645 \u06a9\u06cc \u0646\u0642\u0634\u06c1 \u0633\u0627\u0632\u06cc \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06a9\u06cc\u0648\u06ba \u06c1\u06d2\u06d4 \u0645\u0627\u0688\u0644 \u06a9\u0648 Google Colab \u0645\u06cc\u06ba \u0642\u062f\u0631\u06d2 \u0645\u062e\u062a\u0644\u0641 \u0645\u062a\u063a\u06cc\u0631 \u0646\u0627\u0645\u0648\u06ba \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u062a\u0631\u0628\u06cc\u062a \u062f\u06cc \u06af\u0626\u06cc \u062a\u06be\u06cc (\u062c\u06cc\u0633\u06d2 <code>token_emb<\/code> \u0628\u0645\u0642\u0627\u0628\u0644\u06c1 <code>token_embedding<\/code>)\u06d4 \u0645\u06cc\u067e\u0646\u06af Colab \u06a9\u06d2 \u0646\u0627\u0645 \u06a9\u06d2 \u06a9\u0646\u0648\u0646\u0634\u0646\u0632 \u06a9\u0648 \u0622\u067e \u06a9\u06d2 \u0645\u0642\u0627\u0645\u06cc \u06a9\u0648\u0688 \u06a9\u06d2 \u06a9\u0646\u0648\u0646\u0634\u0646\u0632 \u0645\u06cc\u06ba \u062a\u0631\u062c\u0645\u06c1 \u06a9\u0631\u062a\u06cc \u06c1\u06d2\u06d4 <code>load_state_dict<\/code> \u0645\u06cc\u06ba <code>strict=False<\/code> \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0644\u0648\u0688\u0646\u06af \u0645\u0645\u06a9\u0646 
\u06c1\u06d2 \u06cc\u06c1\u0627\u06ba \u062a\u06a9 \u06a9\u06c1 \u0627\u06af\u0631 \u06a9\u0686\u06be \u0686\u0627\u0628\u06cc\u0627\u06ba \u0628\u0627\u0644\u06a9\u0644 \u0645\u0645\u0627\u062b\u0644 \u0646\u06c1 \u06c1\u0648\u06ba\u06d4<\/p>\n<p>\u0627\u0633 \u06a9\u06d2 \u0639\u0644\u0627\u0648\u06c1\u060c \u06a9\u06cc\u0648\u06ba \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0634\u0631\u0648\u0639 \u06a9\u0631\u06cc\u06ba\u061f \u0679\u06be\u06cc\u06a9 \u06c1\u06d2\u060c SFT \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u067e\u0631 \u0628\u0646\u0627\u06cc\u0627 \u06af\u06cc\u0627 \u06c1\u06d2\u06d4 \u0645\u0627\u0688\u0644 \u0627\u0631\u062f\u0648 \u06af\u0631\u0627\u0645\u0631\u060c \u0627\u0644\u0641\u0627\u0638 \u0627\u0648\u0631 \u062d\u0642\u0627\u0626\u0642 \u06a9\u0648 \u067e\u06c1\u0644\u06d2 \u0633\u06d2 \u062c\u0627\u0646\u062a\u0627 \u06c1\u06d2\u06d4 SFT \u06af\u0641\u062a\u06af\u0648 \u06a9\u06d2 \u0641\u0627\u0631\u0645\u06cc\u0679\u0633 \u0633\u06a9\u06be\u0627\u062a\u0627 \u06c1\u06d2\u06d4 \u0628\u06d2 \u062a\u0631\u062a\u06cc\u0628 \u0648\u0632\u0646 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0634\u0631\u0648\u0639 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0628\u06c1\u062a \u0632\u06cc\u0627\u062f\u06c1 \u0688\u06cc\u0679\u0627 \u0627\u0648\u0631 \u062a\u0631\u0628\u06cc\u062a \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u0648\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<h3 id=\"heading-sft-training-loop\">SFT \u0679\u0631\u06cc\u0646\u0646\u06af \u0644\u0648\u067e<\/h3>\n<p>\u0645\u06a9\u0645\u0644 SFT \u0679\u0631\u06cc\u0646\u0646\u06af \u0644\u0648\u067e \u0645\u0646\u062f\u0631\u062c\u06c1 \u0630\u06cc\u0644 \u06c1\u06d2:<\/p>\n<pre><code class=\"language-python\">SFT_LR = 2e-5\nSFT_EPOCHS = 50\noptimizer = torch.optim.AdamW(model.parameters(), lr=SFT_LR, weight_decay=0.01)\n\nsft_history = {'loss': []}\nbest_loss = float('inf')\n\nfor epoch in range(SFT_EPOCHS):\n 
   model.train()\n    epoch_loss = 0\n    n_batches = 0\n\n    for input_ids, labels in sft_loader:\n        input_ids = input_ids.to(device)\n        labels = labels.to(device)\n\n        outputs = model(input_ids)\n        logits = outputs['logits']\n\n        shift_logits = logits[:, :-1, :].contiguous()\n        shift_labels = labels[:, 1:].contiguous()\n\n        loss = F.cross_entropy(\n            shift_logits.view(-1, shift_logits.size(-1)),\n            shift_labels.view(-1),\n            ignore_index=IGNORE_INDEX,\n        )\n\n        optimizer.zero_grad(set_to_none=True)\n        loss.backward()\n        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n        optimizer.step()\n\n        epoch_loss += loss.item()\n        n_batches += 1\n\n    avg_loss = epoch_loss \/ n_batches\n    sft_history['loss'].append(avg_loss)\n\n    if avg_loss < best_loss:\n        best_loss = avg_loss\n        torch.save({\n            'model_state_dict': model.state_dict(),\n            'config': config.__dict__,\n            'epoch': epoch + 1,\n            'loss': avg_loss,\n        }, \"sft_model.pt\")\n\n    if (epoch + 1) % 10 == 0 or epoch == 0:\n        print(f\"Epoch {epoch+1}\/{SFT_EPOCHS} | Loss: {avg_loss:.4f}\")\n\nprint(f\"SFT complete! 
Best loss: {best_loss:.4f}\")\n<\/code><\/pre>\n<p>\u06cc\u06c1\u0627\u06ba \u06cc\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u06cc\u06c1 \u06c1\u0627\u0626\u067e\u0631\u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631 \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u0633\u06d2 \u0645\u062e\u062a\u0644\u0641 \u06a9\u06cc\u0648\u06ba \u06c1\u06cc\u06ba:<\/p>\n<table>\n<thead>\n<tr>\n<th>\u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631<\/th>\n<th>\u067e\u06cc\u0634\u06af\u06cc \u062a\u0631\u0628\u06cc\u062a<\/th>\n<th>\u0627\u06cc\u0633 \u0627\u06cc\u0641 \u0679\u06cc<\/th>\n<th>\u06cc\u06c1 \u0645\u062e\u062a\u0644\u0641 \u06a9\u06cc\u0648\u06ba \u06c1\u06d2\u061f<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>\u0633\u06cc\u06a9\u06be\u0646\u06d2 \u06a9\u06cc \u0634\u0631\u062d<\/td>\n<td>3e-4<\/td>\n<td>2e-5<\/td>\n<td>\u06a9\u0645 LR \u062a\u0628\u0627\u06c1 \u06a9\u0646 \u0628\u06be\u0648\u0644\u0646\u06d2 \u0633\u06d2 \u0631\u0648\u06a9\u062a\u0627 \u06c1\u06d2\u06d4 \u0627\u06c1\u0645 \u0627\u067e \u0688\u06cc\u0679 \u0622\u067e \u0646\u06d2 \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646 \u0633\u06cc\u06a9\u06be\u0627 \u06c1\u0648\u0627 \u0627\u0631\u062f\u0648 \u0639\u0644\u0645 \u062e\u062a\u0645 \u06a9\u0631 \u062f\u06d2 \u06af\u0627\u06d4<\/td>\n<\/tr>\n<tr>\n<td>\u0627\u0648\u0642\u0627\u062a<\/td>\n<td>3<\/td>\n<td>50<\/td>\n<td>\u0644\u0627\u06a9\u06be\u0648\u06ba \u0679\u0648\u06a9\u0646\u0632 \u06a9\u06d2 \u0645\u0642\u0627\u0628\u0644\u06d2 \u0645\u06cc\u06ba \u0635\u0631\u0641 79 \u0645\u062b\u0627\u0644\u06cc\u06ba \u06c1\u06cc\u06ba\u06d4 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u06af\u0641\u062a\u06af\u0648 \u06a9\u06d2 \u0646\u0645\u0648\u0646\u06d2 \u0633\u06cc\u06a9\u06be\u0646\u06d2 \u0645\u06cc\u06ba \u0628\u06c1\u062a \u0633\u06d2 \u06af\u0632\u0631\u0646\u06d2 \u0644\u06af\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/td>\n<\/tr>\n<tr>\n<td>\u0648\u0632\u0646 \u0645\u06cc\u06ba 
\u06a9\u0645\u06cc<\/td>\n<td>0.1<\/td>\n<td>0.01<\/td>\n<td>\u0686\u0648\u0646\u06a9\u06c1 \u06c1\u0645 \u0686\u0627\u06c1\u062a\u06d2 \u06c1\u06cc\u06ba \u06a9\u06c1 \u0645\u0627\u0688\u0644 \u0627\u0646 \u0645\u062e\u0635\u0648\u0635 \u0645\u062b\u0627\u0644\u0648\u06ba \u06a9\u0648 \u0642\u0631\u06cc\u0628 \u0633\u06d2 \u0641\u0679 \u06a9\u0631\u06d2\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u06a9\u0645 \u0631\u06cc\u06af\u0648\u0644\u0631\u0627\u0626\u0632\u06cc\u0634\u0646 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u06d2\u06d4<\/td>\n<\/tr>\n<tr>\n<td>LR \u0634\u06cc\u0688\u0648\u0644<\/td>\n<td>\u06a9\u0648\u0633\u0627\u0626\u0646 \u0648\u0627\u0631\u0645 \u0627\u067e<\/td>\n<td>\u0645\u0633\u062a\u0642\u0644<\/td>\n<td>\u0686\u06be\u0648\u0679\u06d2 \u0688\u06cc\u0679\u0627 \u067e\u0631 \u0641\u0627\u0626\u0646 \u0679\u06cc\u0648\u0646\u0646\u06af \u06a9\u06d2 \u0644\u06cc\u06d2 \u0622\u0633\u0627\u0646 \u0627\u0648\u0631 \u0645\u0648\u062b\u0631\u06d4<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0645\u0631\u0627\u062d\u0644 \u062f\u0631\u062c \u0630\u06cc\u0644 \u06c1\u06cc\u06ba (\u0641\u06cc \u0628\u06cc\u0686):<\/p>\n<pre><code class=\"language-python\"># Forward pass with no targets; we compute loss manually\noutputs = model(input_ids)\nlogits = outputs['logits']\n\n# Shift for next-token prediction\nshift_logits = logits[:, :-1, :].contiguous()    # Predictions at positions 0..254\nshift_labels = labels[:, 1:].contiguous()         # Targets at positions 1..255\n\n# Loss with masking\nloss = F.cross_entropy(\n    shift_logits.view(-1, shift_logits.size(-1)),\n    shift_labels.view(-1),\n    ignore_index=IGNORE_INDEX,  # Skip -100 positions\n)\n<\/code><\/pre>\n<p>\u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u0633\u06d2 \u0628\u0646\u06cc\u0627\u062f\u06cc \u0641\u0631\u0642 \u06cc\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u0645\u06cc\u06ba 
\u06c1\u062f\u0641 \u0628\u0631\u0627\u06c1 \u0631\u0627\u0633\u062a \u067e\u06c1\u0646\u0686\u0627\u06cc\u0627 \u062c\u0627\u062a\u0627 \u062a\u06be\u0627\u06d4 <code>model(input_ids, targets)<\/code> \u06c1\u0645 \u0646\u06d2 \u06c1\u0631 \u0679\u0648\u06a9\u0646 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u0646\u062f\u0631\u0648\u0646\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0646\u0642\u0635\u0627\u0646 \u06a9\u0627 \u062d\u0633\u0627\u0628 \u0644\u06af\u0627\u06cc\u0627 \u06c1\u06d2\u06d4 \u06cc\u06c1\u0627\u06ba \u0622\u067e \u062f\u0633\u062a\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0646\u0642\u0635\u0627\u0646 \u06a9\u0627 \u062d\u0633\u0627\u0628 \u0644\u06af\u0627 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba \u0627\u0648\u0631 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba: <code>ignore_index=-100<\/code> \u063a\u06cc\u0631 \u0645\u0639\u0627\u0648\u0646 \u0645\u0642\u0627\u0645\u0627\u062a \u06a9\u0648 \u0645\u0627\u0633\u06a9 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2\u06d4<\/p>\n<p><strong>\u0634\u0641\u0679:<\/strong> <code>logits[:, :-1]<\/code>    \u0627\u0648\u0631 <code>labels[:, 1:]<\/code> \u062f\u0631\u062c \u0630\u06cc\u0644 \u0679\u0648\u06a9\u0646 \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648\u0626\u06cc \u06a9\u0648 \u0644\u0627\u06af\u0648 \u06a9\u0631\u06cc\u06ba: \u0645\u0642\u0627\u0645 \u067e\u0631 \u0645\u0627\u0688\u0644 \u06a9\u06cc \u067e\u06cc\u0634\u06cc\u0646 \u06af\u0648\u0626\u06cc\u0627\u06ba <code>i<\/code> \u0627\u0633 \u06a9\u0627 \u0645\u0648\u0627\u0632\u0646\u06c1 \u0645\u0642\u0627\u0645 \u067e\u0631 \u0645\u0648\u062c\u0648\u062f \u0627\u0635\u0644 \u0679\u0648\u06a9\u0646 \u0633\u06d2 \u06a9\u06cc\u0627 \u062c\u0627\u062a\u0627 \u06c1\u06d2\u06d4 <code>i+1<\/code>.<\/p>\n<p>\u0631\u06cc\u0648\u0631\u0633 \u067e\u0627\u0633 + \u0627\u067e \u0688\u06cc\u0679:<\/p>\n<pre><code 
class=\"language-python\">optimizer.zero_grad(set_to_none=True)\nloss.backward()\ntorch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\noptimizer.step()\n<\/code><\/pre>\n<p>\u06cc\u06c1 \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u062c\u06cc\u0633\u0627 \u06c1\u06cc \u06c1\u06d2\u06d4 \u0639\u062f\u0645 \u0627\u0633\u062a\u062d\u06a9\u0627\u0645 \u06a9\u0648 \u0631\u0648\u06a9\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06af\u0631\u06cc\u0688\u06cc\u0626\u0646\u0679\u0633 \u2192 \u0628\u06cc\u06a9 \u067e\u0631\u0648\u067e\u06cc\u06af\u06cc\u0634\u0646 \u2192 \u06a9\u0644\u067e \u2192 \u0627\u067e \u0688\u06cc\u0679 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 \u06a9\u0648 \u0635\u0627\u0641 \u06a9\u0631\u06cc\u06ba\u06d4 1.0 \u067e\u0631 \u06af\u0631\u0627\u0688\u06cc\u0646\u0679 \u06a9\u0644\u067e\u0646\u06af \u06cc\u06c1\u0627\u06ba \u062e\u0627\u0635 \u0637\u0648\u0631 \u067e\u0631 \u0627\u06c1\u0645 \u06c1\u06d2 \u06a9\u06cc\u0648\u0646\u06a9\u06c1 \u0645\u0627\u0688\u0644 \u0679\u06be\u06cc\u06a9 \u06c1\u06d2 \u0627\u0648\u0631 \u06a9\u0686\u06be \u06af\u0631\u0627\u0688\u06cc\u0646\u0679 \u0686\u06be\u0648\u0679\u06d2 \u0688\u06cc\u0679\u0627 \u067e\u0631 \u0628\u0691\u06d2 \u06c1\u0648 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p>\u0686\u0648\u06a9\u06cc:<\/p>\n<pre><code class=\"language-python\">if avg_loss < best_loss:\n    torch.save({'model_state_dict': model.state_dict(), ...}, \"sft_model.pt\")\n<\/code><\/pre>\n<p>\u062c\u0628 \u0628\u06be\u06cc \u0622\u067e \u06a9\u06d2 \u062a\u0631\u0628\u06cc\u062a\u06cc \u0646\u0642\u0635\u0627\u0646 \u0645\u06cc\u06ba \u0628\u06c1\u062a\u0631\u06cc \u0622\u0626\u06d2 \u062a\u0648 \u0628\u0686\u062a \u06a9\u0631\u06cc\u06ba\u06d4 \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u06a9\u06d2 \u0628\u0631\u0639\u06a9\u0633\u060c \u06a9\u0648\u0626\u06cc \u0627\u0644\u06af \u062a\u0648\u062b\u06cc\u0642 \u0633\u06cc\u0679 \u0646\u06c1\u06cc\u06ba \u06c1\u06d2 (79 
\u0645\u062b\u0627\u0644\u06cc\u06ba \u062a\u0642\u0633\u06cc\u0645 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0628\u06c1\u062a \u06a9\u0645 \u06c1\u06cc\u06ba)\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u06c1\u0645 \u062a\u0631\u0628\u06cc\u062a \u06a9\u06d2 \u0646\u0642\u0635\u0627\u0646 \u06a9\u0648 \u0686\u06cc\u06a9 \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<h3 id=\"heading-chat-function-inference\">\u0686\u06cc\u0679 \u06a9\u06cc \u062e\u0635\u0648\u0635\u06cc\u0627\u062a: \u0627\u0646\u062f\u0627\u0632\u06c1<\/h3>\n<p>\u0645\u06a9\u0645\u0644 \u0686\u06cc\u0679 \u06a9\u06cc \u062e\u0635\u0648\u0635\u06cc\u0627\u062a \u0645\u06cc\u06ba \u0634\u0627\u0645\u0644 \u06c1\u06cc\u06ba:<\/p>\n<pre><code class=\"language-python\">def chat(model, tokenizer, user_message: str, system_prompt: str = None,\n         max_tokens: int = 100, temperature: float = 0.7) -> str:\n    \"\"\"Generate a chat response.\"\"\"\n    model.eval()\n\n    if system_prompt is None:\n        system_prompt = SYSTEM_PROMPT\n\n    # Build the prompt\n    prompt_ids = [BOS_ID, SYSTEM_ID]\n\n    sys_ids = tokenizer.encode(system_prompt).ids\n    if sys_ids and sys_ids[0] == BOS_ID: sys_ids = sys_ids[1:]\n    if sys_ids and sys_ids[-1] == EOS_ID: sys_ids = sys_ids[:-1]\n    prompt_ids.extend(sys_ids)\n    prompt_ids.append(SEP_ID)\n\n    prompt_ids.append(USER_ID)\n    user_ids = tokenizer.encode(user_message).ids\n    if user_ids and user_ids[0] == BOS_ID: user_ids = user_ids[1:]\n    if user_ids and user_ids[-1] == EOS_ID: user_ids = user_ids[:-1]\n    prompt_ids.extend(user_ids)\n    prompt_ids.append(SEP_ID)\n\n    prompt_ids.append(ASSISTANT_ID)\n\n    # Generate\n    input_tensor = torch.tensor([prompt_ids], dtype=torch.long).to(device)\n    with torch.no_grad():\n        output_ids = model.generate(\n            input_tensor,\n            max_new_tokens=max_tokens,\n            temperature=temperature,\n            top_k=50,\n            top_p=0.9,\n            
eos_token_id=EOS_ID,\n        )\n\n    # Decode only the generated part\n    generated_ids = output_ids[0][len(prompt_ids):].tolist()\n    if EOS_ID in generated_ids:\n        generated_ids = generated_ids[:generated_ids.index(EOS_ID)]\n\n    return tokenizer.decode(generated_ids)\n<\/code><\/pre>\n<p>\u06cc\u06c1\u0627\u06ba \u0645\u0631\u062d\u0644\u06c1 \u0648\u0627\u0631 \u062a\u062c\u0632\u06cc\u06c1 \u06c1\u06d2:<\/p>\n<p><strong>1. \u067e\u0631\u0627\u0645\u067e\u0679 \u0645\u06a9\u0645\u0644 \u06a9\u0631\u06cc\u06ba:<\/strong><\/p>\n<pre><code class=\"language-python\">prompt_ids = [BOS_ID, SYSTEM_ID]\nprompt_ids.extend(sys_ids)          # System prompt content\nprompt_ids.append(SEP_ID)\nprompt_ids.append(USER_ID)\nprompt_ids.extend(user_ids)          # User message content\nprompt_ids.append(SEP_ID)\nprompt_ids.append(ASSISTANT_ID)      # \"Now respond...\"\n<\/code><\/pre>\n<p>\u06cc\u06c1 \u0628\u0627\u0644\u06a9\u0644 \u0648\u06c1\u06cc \u0634\u06a9\u0644 \u062a\u0634\u06a9\u06cc\u0644 \u062f\u06cc\u062a\u0627 \u06c1\u06d2 \u062c\u0648 \u0645\u0627\u0688\u0644 \u0646\u06d2 SFT \u0679\u0631\u06cc\u0646\u0646\u06af \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646 \u062f\u06cc\u06a9\u06be\u0627 \u062a\u06be\u0627\u06d4<\/p>\n<pre><code class=\"language-plaintext\">&lt;bos&gt;&lt;|system|&gt;\u0622\u067e \u0627\u06cc\u06a9 \u0645\u062f\u062f\u06af\u0627\u0631...&lt;sep&gt;&lt;|user|&gt;\u067e\u0627\u06a9\u0633\u062a\u0627\u0646 \u06a9\u0627 \u062f\u0627\u0631\u0627\u0644\u062d\u06a9\u0648\u0645\u062a\u061f&lt;sep&gt;&lt;|assistant|&gt;\n<\/code><\/pre>\n<p>\u0645\u0627\u0688\u0644 \u0644\u06af \u0631\u06c1\u0627 \u06c1\u06d2\u06d4 <code><|assistant|><\/code> \u0627\u0648\u0631 \u0645\u06cc\u06ba \u062c\u0627\u0646\u062a\u0627 \u06c1\u0648\u06ba \u06a9\u06c1 SFT \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646 \u0645\u062c\u06be\u06d2 \"\u0627\u0628 \u0627\u06cc\u06a9 \u062c\u0648\u0627\u0628 \u067e\u06cc\u062f\u0627 \u06a9\u0631\u0646\u06d2 \u06a9\u06cc 
\u0636\u0631\u0648\u0631\u062a \u06c1\u06d2\"\u06d4 <code><|assistant|><\/code> \u06cc\u06c1 \u0648\u06c1\u06cc \u06c1\u06d2 \u062c\u0648 \u0627\u0633\u06d2 \u067e\u06cc\u062f\u0627 \u06a9\u0631\u0646\u0627 \u06c1\u06d2.<\/p>\n<p><strong>2. \u062e\u0648\u062f \u0628\u062e\u0648\u062f \u067e\u06cc\u062f\u0627 \u06a9\u0631\u06cc\u06ba:<\/strong><\/p>\n<pre><code class=\"language-python\">with torch.no_grad():\n    output_ids = model.generate(\n        input_tensor,\n        max_new_tokens=max_tokens,\n        temperature=temperature,\n        top_k=50,\n        top_p=0.9,\n        eos_token_id=EOS_ID,\n    )\n<\/code><\/pre>\n<ul>\n<li>\n<p><code>torch.no_grad()<\/code><strong>:<\/strong> \u0627\u0646\u0641\u0631\u0646\u0633 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06af\u0631\u06cc\u0688\u06cc\u0626\u0646\u0679\u0633 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u0646\u06c1\u06cc\u06ba \u06c1\u06d2\u060c \u0645\u06cc\u0645\u0648\u0631\u06cc \u0627\u0648\u0631 \u0631\u0641\u062a\u0627\u0631 \u06a9\u0648 \u0628\u0686\u0627\u0646\u0627\u06d4<\/p>\n<\/li>\n<li>\n<p><code>temperature=0.7<\/code><strong>:<\/strong> \u0645\u0633\u0644\u0633\u0644\u060c \u0644\u06cc\u06a9\u0646 \u0642\u062f\u0631\u06d2 \u062a\u06cc\u0632 \u062a\u0642\u0633\u06cc\u0645\u060c \u0646\u06c1 \u06a9\u06c1 \u0631\u0648\u0628\u0648\u0679\u06a9 \u0622\u0624\u0679 \u067e\u0679\u06d4<\/p>\n<\/li>\n<li>\n<p><code>top_k=50<\/code><strong>:<\/strong> \u06a9\u0645 \u0627\u0645\u06a9\u0627\u0646 \u0634\u0648\u0631 \u0633\u06d2 \u0628\u0686\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06c1\u0645 \u0635\u0631\u0641 \u0679\u0627\u067e 50 \u0679\u0648\u06a9\u0646\u0632 \u0633\u06d2 \u0646\u0645\u0648\u0646\u06c1 \u0644\u06cc\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><code>top_p=0.9<\/code><strong>:<\/strong> \u0646\u06cc\u0648\u06a9\u0644\u0626\u0633 \u0633\u06cc\u0645\u067e\u0644\u0646\u06af\u060c \u062c\u0648 \u0645\u062a\u062d\u0631\u06a9 \u0637\u0648\u0631 \u067e\u0631 0.9 \u06cc\u0627 
\u0627\u0633 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u06a9\u06d2 \u0645\u062c\u0645\u0648\u0639\u06cc \u0627\u0645\u06a9\u0627\u0646 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0679\u0648\u06a9\u0646 \u06a9\u06d2 \u0633\u0628 \u0633\u06d2 \u0686\u06be\u0648\u0679\u06d2 \u0633\u06cc\u0679 \u06a9\u0648 \u0645\u0646\u062a\u062e\u0628 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p><code>eos_token_id<\/code><strong>:<\/strong> \u062c\u0628 <code>&lt;eos&gt;<\/code> \u0679\u0648\u06a9\u0646 \u067e\u06cc\u062f\u0627 \u06c1\u0648\u062a\u0627 \u06c1\u06d2 \u062a\u0648 \u062c\u0646\u0631\u06cc\u0634\u0646 \u0631\u06a9 \u062c\u0627\u062a\u06cc \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<\/ul>\n<p><strong>3. \u0646\u06a9\u0627\u0644\u0646\u0627 \u0627\u0648\u0631 \u0636\u0627\u0628\u0637\u06c1 \u06a9\u0634\u0627\u0626\u06cc \u06a9\u0631\u0646\u0627:<\/strong><\/p>\n<pre><code class=\"language-python\">generated_ids = output_ids[0][len(prompt_ids):].tolist()    # Only the new tokens\nif EOS_ID in generated_ids:\n    generated_ids = generated_ids[:generated_ids.index(EOS_ID)]  # Trim at EOS\nreturn tokenizer.decode(generated_ids)\n<\/code><\/pre>\n<p>\u067e\u0631\u0627\u0645\u067e\u0679 \u06a9\u0648 \u06a9\u0627\u0679 \u062f\u06cc\u06ba (\u06c1\u0645 \u0633\u0633\u0679\u0645 \u067e\u0631\u0627\u0645\u067e\u0679 \u0627\u0648\u0631 \u0635\u0627\u0631\u0641 \u06a9\u0627 \u067e\u06cc\u063a\u0627\u0645 \u062f\u0648\u0628\u0627\u0631\u06c1 \u0648\u0627\u067e\u0633 \u0646\u06c1\u06cc\u06ba \u06a9\u0631\u0646\u0627 \u0686\u0627\u06c1\u062a\u06d2)\u060c <code>&lt;eos&gt;<\/code> \u067e\u0631 \u062a\u0631\u0627\u0634\u06cc\u06ba\u060c \u0627\u0648\u0631 \u0679\u0648\u06a9\u0646 \u0622\u0626\u06cc \u0688\u06cc\u0632 \u06a9\u0648 \u062f\u0648\u0628\u0627\u0631\u06c1 \u0627\u0631\u062f\u0648 \u0679\u06cc\u06a9\u0633\u0679 \u0645\u06cc\u06ba \u0688\u06cc \u06a9\u0648\u0688 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<h2 id=\"heading-5-deployment\">5. 
\u062a\u0642\u0633\u06cc\u0645<\/h2>\n<p>\u0627\u0633 \u0648\u0642\u062a\u060c \u0622\u067e \u06a9\u0627 \u0627\u067e\u0646\u0627 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0627\u06cc\u06a9 \u0628\u06c1\u062a \u0628\u0691\u0627 \u0633\u0646\u06af \u0645\u06cc\u0644 \u06c1\u06d2\u06d4 \u0644\u06cc\u06a9\u0646 \u0627\u0628 \u0628\u06be\u06cc \u0627\u06cc\u06a9 \u06a9\u0644\u0627\u0633\u06a9 \u0645\u0633\u0626\u0644\u06c1 \u06c1\u06d2\u06d4 \"\u06cc\u06c1 \u0645\u06cc\u0631\u06d2 \u06a9\u0645\u067e\u06cc\u0648\u0679\u0631 \u067e\u0631 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4\"<\/p>\n<p>\u0627\u067e\u0646\u06d2 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u062f\u0648\u0633\u0631\u0648\u06ba \u06a9\u06d2 \u0644\u06cc\u06d2 \u062f\u0633\u062a\u06cc\u0627\u0628 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2\u060c \u0622\u067e \u06a9\u0648 \u0627\u0633\u06d2 \u062a\u0642\u0633\u06cc\u0645 \u06a9\u0631\u0646\u06d2 \u0627\u0648\u0631 \u0635\u0627\u0631\u0641\u06cc\u0646 \u06a9\u0648 \u0627\u0633 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u062a\u0639\u0627\u0645\u0644 \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0627\u06cc\u06a9 \u0627\u0646\u0679\u0631\u0641\u06cc\u0633 \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u0646\u06d2 \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u06c1\u06d2\u06d4<\/p>\n<p>\u062a\u0639\u06cc\u0646\u0627\u062a\u06cc \u06a9\u06d2 \u0627\u062e\u062a\u06cc\u0627\u0631\u0627\u062a \u06a9\u06cc \u062a\u0644\u0627\u0634 \u06a9\u06d2 \u062f\u0648\u0631\u0627\u0646\u060c \u0645\u06cc\u06ba \u0646\u06d2 Gradio \u06a9\u0648 \u062f\u0631\u06cc\u0627\u0641\u062a \u06a9\u06cc\u0627\u060c \u062c\u0648 \u0645\u0634\u06cc\u0646 \u0644\u0631\u0646\u0646\u06af \u0645\u0627\u0688\u0644\u0632 \u0627\u0648\u0631 \u0627\u06cc\u067e\u0644\u06cc\u06a9\u06cc\u0634\u0646\u0632 \u06a9\u0648 \u062a\u0639\u06cc\u0646\u0627\u062a \u06a9\u0631\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 
\u0627\u06cc\u06a9 \u0633\u0627\u062f\u06c1 \u0627\u0648\u0631 \u0635\u0627\u0641 \u0627\u0646\u0679\u0631\u0641\u06cc\u0633 \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 Gradio \u06a9\u0645 \u0633\u06d2 \u06a9\u0645 \u0633\u06cc\u0679 \u0627\u067e \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0645\u0641\u062a \u06c1\u0648\u0633\u0679\u0646\u06af \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2\u060c Hugging Face Spaces \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0628\u0631\u0627\u06c1 \u0631\u0627\u0633\u062a \u0636\u0645 \u06c1\u0648\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<h3 id=\"heading-gradio-web-interface-apppy\">\u06af\u0631\u06cc\u0688\u06cc\u0648 \u0648\u06cc\u0628 \u0627\u0646\u0679\u0631\u0641\u06cc\u0633 (<code>app.py<\/code>)<\/h3>\n<p>\u06a9\u06c1 <code>app.py<\/code> \u0641\u0627\u0626\u0644\u06cc\u06ba \u06c1\u0631 \u0686\u06cc\u0632 \u06a9\u0648 \u062c\u0648\u0691 \u062f\u06cc\u062a\u06cc \u06c1\u06cc\u06ba\u06d4 \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 \u0627\u0648\u0631 \u0645\u0627\u0688\u0644 \u0644\u0648\u0688 \u06a9\u0631\u06cc\u06ba\u06d4 <code>chat()<\/code> \u0641\u0646\u06a9\u0634\u0646 \u06a9\u0648 \u0627\u0646\u062c\u0627\u0645 \u062f\u06cc\u062a\u0627 \u06c1\u06d2 \u0627\u0648\u0631 Gradio UI \u0644\u0627\u0646\u0686 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4 \u0645\u0627\u0688\u0644 \u0644\u0648\u0688\u0646\u06af \u0627\u0648\u0631 <code>chat()<\/code> \u0645\u0646\u0637\u0642 \u0648\u06c1\u06cc \u06c1\u06d2 \u062c\u0633 \u06a9\u0627 \u0627\u062d\u0627\u0637\u06c1 SFT \u0633\u06cc\u06a9\u0634\u0646 \u0645\u06cc\u06ba \u06a9\u06cc\u0627 \u06af\u06cc\u0627 \u06c1\u06d2\u060c \u0644\u06c1\u0630\u0627 \u06cc\u06c1\u0627\u06ba \u0635\u0631\u0641 Gradio \u06a9\u06d2 \u0645\u062e\u0635\u0648\u0635 \u062d\u0635\u06d2 \u062f\u06a9\u06be\u0627\u0626\u06d2 \u06af\u0626\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<pre><code class=\"language-python\">import gradio as gr\n\ndef respond(message, 
history):\n    if not message.strip():\n        return \"\u0628\u0631\u0627\u06c1 \u06a9\u0631\u0645 \u06a9\u0686\u06be \u0644\u06a9\u06be\u06cc\u06ba\u06d4\"\n    return chat(message)\n\ndemo = gr.ChatInterface(\n    fn=respond,\n    title=\"&#x1f1f5;&#x1f1f0; \u0627\u0631\u062f\u0648 LLM \u0686\u06cc\u0679 \u0628\u0648\u0679\",\n    description=\"\"\"\n    ### \u0627\u06cc\u06a9 \u0686\u06be\u0648\u0679\u0627 \u0627\u0631\u062f\u0648 \u0632\u0628\u0627\u0646 \u0645\u0627\u0688\u0644 \u062c\u0648 \u0634\u0631\u0648\u0639 \u0633\u06d2 \u062a\u06cc\u0627\u0631 \u06a9\u06cc\u0627 \u06af\u06cc\u0627 \u06c1\u06d2\n    **A small Urdu language model built from scratch (~23M parameters)**\n    \"\"\",\n    examples=[\n        \"\u0627\u0644\u0633\u0644\u0627\u0645 \u0639\u0644\u06cc\u06a9\u0645\",\n        \"\u067e\u0627\u06a9\u0633\u062a\u0627\u0646 \u06a9\u0627 \u062f\u0627\u0631\u0627\u0644\u062d\u06a9\u0648\u0645\u062a \u06a9\u06cc\u0627 \u06c1\u06d2\u061f\",\n        \"\u0644\u0627\u06c1\u0648\u0631 \u06a9\u06d2 \u0628\u0627\u0631\u06d2 \u0645\u06cc\u06ba \u0628\u062a\u0627\u0626\u06cc\u06ba\u06d4\",\n        \"\u0628\u0631\u06cc\u0627\u0646\u06cc \u06a9\u06cc\u0633\u06d2 \u0628\u0646\u062a\u06cc \u06c1\u06d2\u061f\",\n        \"\u06a9\u0631\u06a9\u0679 \u06a9\u06cc\u0633\u06d2 \u06a9\u06be\u06cc\u0644\u06cc \u062c\u0627\u062a\u06cc \u06c1\u06d2\u061f\",\n        \"\u0686\u0627\u0646\u062f \u06a9\u06cc\u0633\u06d2 \u0686\u0645\u06a9\u062a\u0627 \u06c1\u06d2\u061f\",\n        \"\u0631\u0645\u0636\u0627\u0646 \u06a9\u06cc\u0627 \u06c1\u06d2\u061f\",\n        \"\u0639\u0644\u0627\u0645\u06c1 \u0627\u0642\u0628\u0627\u0644 \u06a9\u0648\u0646 \u062a\u06be\u06d2\u061f\",\n        \"\u062e\u0648\u0634 \u06a9\u06cc\u0633\u06d2 \u0631\u06c1\u06cc\u06ba\u061f\",\n        \"\u0622\u067e \u06a9\u0648\u0646 \u06c1\u06cc\u06ba\u061f\",\n    ],\n    theme=gr.themes.Soft(),\n)\n\nif __name__ == \"__main__\":\n    demo.launch()\n<\/code><\/pre>\n<ul>\n<li>\n<p><code>respond()<\/code> 
   \u0644\u06cc\u0628 <code>chat()<\/code> \u062e\u0627\u0644\u06cc \u0627\u0646 \u067e\u0679 \u06af\u0627\u0631\u0688 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 Gradio \u06a9\u06d2 \u062f\u0633\u062a\u062e\u0637 \u0633\u06d2 \u0645\u06cc\u0644 \u06a9\u06be\u0627\u062a\u0627 \u06c1\u06d2\u06d4 <code>ChatInterface<\/code> \u0645\u06cc\u06ba \u0627\u0633 \u06a9\u0627 \u0645\u0646\u062a\u0638\u0631 \u06c1\u0648\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><code>gr.ChatInterface<\/code>    \u06cc\u06c1 \u0645\u06cc\u0633\u062c \u06c1\u0633\u0679\u0631\u06cc\u060c \u0627\u0646 \u067e\u0679 \u0648\u0646\u0688\u0648\u060c \u0633\u06cc\u0646\u0688 \u0628\u0679\u0646 \u0648\u063a\u06cc\u0631\u06c1 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0627\u06cc\u06a9 \u0631\u06cc\u0688\u06cc \u0645\u06cc\u0688 \u0686\u06cc\u0679 UI \u0641\u0631\u0627\u06c1\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<li>\n<p><code>examples<\/code>    \u06cc\u06c1 \u0627\u06cc\u06a9 \u067e\u06c1\u0644\u06d2 \u0633\u06d2 \u0622\u0628\u0627\u062f \u067e\u06cc\u063a\u0627\u0645 \u06c1\u06d2 \u062c\u0633\u06d2 \u0635\u0627\u0631\u0641 \u0622\u0632\u0645\u0627\u0646\u06d2 \u06a9\u06d2 \u0644\u06cc\u06d2 \u06a9\u0644\u06a9 \u06a9\u0631 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p><code>theme=gr.themes.Soft()<\/code>    \u06cc\u06c1 \u0627\u06cc\u06a9 \u0635\u0627\u0641 \u0627\u0648\u0631 \u062c\u062f\u06cc\u062f \u0628\u0635\u0631\u06cc \u062a\u06be\u06cc\u0645 \u067e\u06cc\u0634 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<\/li>\n<\/ul>\n<p><strong>\u0645\u06cc\u0645\u0648:<\/strong> \u06af\u0644\u06d2 \u0644\u06af\u0627\u0646\u0627 \u0686\u06c1\u0631\u06d2 \u06a9\u06cc \u062c\u06af\u06c1 \u0686\u0644\u0627\u0626\u06cc\u06ba\u06d4 <code>app.py<\/code> \u0686\u0648\u0646\u06a9\u06c1 \u06cc\u06c1 \u0627\u06cc\u06a9 \u0627\u0633\u0679\u06cc\u0646\u0688 \u0627\u0633\u06a9\u0631\u067e\u0679 \u06c1\u06d2\u060c 
\u0645\u06a9\u0645\u0644 <code>app.py<\/code> \u0630\u062e\u06cc\u0631\u06c1 \u06c1\u0631 \u0686\u06cc\u0632 \u06a9\u0648 \u0627\u06cc\u06a9 \u0641\u0627\u0626\u0644 \u0645\u06cc\u06ba \u0627\u0646 \u0644\u0627\u0626\u0646 \u06a9\u0631\u062a\u0627 \u06c1\u06d2: \u0645\u0627\u0688\u0644 \u06a9\u0646\u0641\u06cc\u06af\u0631\u06cc\u0634\u0646\u060c \u067e\u0648\u0631\u0627 \u0645\u062a\u0631\u062c\u0645 \u0641\u0646 \u062a\u0639\u0645\u06cc\u0631\u060c \u0627\u0648\u0631 \u0645\u0627\u0688\u0644 \u0644\u0648\u0688\u0646\u06af\u06d4 <code>gc.collect()<\/code> \u0645\u06cc\u0645\u0648\u0631\u06cc \u06a9\u06cc \u0627\u0635\u0644\u0627\u062d \u06a9\u06d2 \u0644\u06cc\u06d2 <code>chat()<\/code> \u062e\u0635\u0648\u0635\u06cc\u0627\u062a \u0627\u0648\u0631 \u0627\u0648\u067e\u0631 \u06af\u0631\u06cc\u0688\u06cc\u0648 \u0627\u0646\u0679\u0631\u0641\u06cc\u0633\u06d4<\/p>\n<p>\u06c1\u0645 \u0627\u0633\u06d2 \u067e\u06c1\u0644\u06d2 \u06c1\u06cc \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af \u0627\u0648\u0631 SFT \u0633\u06cc\u06a9\u0634\u0646\u0632 \u0645\u06cc\u06ba \u06a9\u0648\u0631 \u06a9\u0631 \u0686\u06a9\u06d2 \u06c1\u06cc\u06ba\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u0645\u06cc\u06ba \u06cc\u06c1\u0627\u06ba \u06cc\u06c1 \u0633\u0628 \u0646\u06c1\u06cc\u06ba \u062f\u06c1\u0631\u0627\u0624\u06ba \u06af\u0627\u06d4<\/p>\n<p><strong>\u0645\u0642\u0627\u0645\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0686\u0644\u0627\u0626\u06cc\u06ba:<\/strong><\/p>\n<pre><code class=\"language-bash\">python app.py\n# Opens at http:\/\/127.0.0.1:7860\n<\/code><\/pre>\n<h3 id=\"heading-deployment-options\">\u062a\u0639\u06cc\u0646\u0627\u062a\u06cc \u06a9\u06d2 \u0627\u062e\u062a\u06cc\u0627\u0631\u0627\u062a<\/h3>\n<h4 id=\"heading-option-a-hugging-face-spaces-free-recommended\">\u0627\u062e\u062a\u06cc\u0627\u0631 A: \u0686\u06c1\u0631\u06d2 \u06a9\u06cc \u062c\u06af\u06c1 \u06a9\u0648 \u06af\u0644\u06d2 \u0644\u06af\u0627\u0646\u0627 (\u0645\u0641\u062a\u060c 
\u062a\u062c\u0648\u06cc\u0632 \u06a9\u0631\u062f\u06c1)<\/h4>\n<p>Hugging Face Spaces Gradio \u0627\u06cc\u067e\u0633 \u06a9\u06d2 \u0644\u06cc\u06d2 \u0645\u0641\u062a CPU \u06c1\u0648\u0633\u0679\u0646\u06af \u067e\u06cc\u0634 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<p><strong>\u06a9\u06cc\u0627 \u0627\u067e \u0644\u0648\u0688 \u06a9\u0631\u0646\u0627 \u06c1\u06d2:<\/strong><\/p>\n<pre><code class=\"language-plaintext\">urdu-llm-chat\/\n\u251c\u2500\u2500 app.py                          # Gradio web interface\n\u251c\u2500\u2500 requirements.txt                # torch, tokenizers, gradio\n\u251c\u2500\u2500 README.md                       # Space metadata (sdk: gradio)\n\u251c\u2500\u2500 model\/\n\u2502   \u251c\u2500\u2500 __init__.py\n\u2502   \u251c\u2500\u2500 config.py\n\u2502   \u251c\u2500\u2500 transformer.py\n\u2502   \u2514\u2500\u2500 checkpoints\/sft_model.pt    # ~90MB trained model weights\n\u2514\u2500\u2500 tokenizer\/\n    \u2514\u2500\u2500 urdu_tokenizer\/\n        \u2514\u2500\u2500 urdu_bpe_tokenizer.json\n<\/code><\/pre>\n<p><strong>\u06cc\u06c1 \u06a9\u06cc\u0633\u06d2 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2:<\/strong><\/p>\n<ol>\n<li>\n<p>Huggingface.co \u067e\u0631 \u0627\u06cc\u06a9 \u0645\u0641\u062a \u0627\u06a9\u0627\u0624\u0646\u0679 \u0628\u0646\u0627\u0626\u06cc\u06ba<\/p>\n<\/li>\n<li>\n<p>\u0627\u06cc\u06a9 \u0646\u0626\u06cc \u062c\u06af\u06c1 \u0628\u0646\u0627\u0626\u06cc\u06ba (SDK: Gradio\u060c Hardware: CPU Basic)<\/p>\n<\/li>\n<li>\n<p>Git \u06a9\u06d2 \u0630\u0631\u06cc\u0639\u06d2 \u0641\u0627\u0626\u0644\u0648\u06ba \u06a9\u0648 \u067e\u0634 \u06a9\u0631\u06cc\u06ba\u06d4 <code>git clone https:\/\/huggingface.co\/spaces\/USERNAME\/urdu-llm-chat<\/code><\/p>\n<\/li>\n<li>\n<p>\u067e\u0631\u0648\u062c\u06cc\u06a9\u0679 \u0641\u0627\u0626\u0644\u0648\u06ba \u06a9\u0648 \u06a9\u0627\u067e\u06cc \u06a9\u0631\u06cc\u06ba \u0627\u0648\u0631 \u06a9\u0644\u0648\u0646 \u0634\u062f\u06c1 
\u0630\u062e\u06cc\u0631\u06c1 \u0645\u06cc\u06ba \u062f\u06be\u06a9\u06cc\u0644\u06cc\u06ba\u06d4<\/p>\n<\/li>\n<li>\n<p>\u06c1\u06af\u0646\u06af \u0641\u06cc\u0633 \u062e\u0648\u062f \u0628\u062e\u0648\u062f \u0627\u0633 \u06a9\u06cc \u0627\u0646\u062d\u0635\u0627\u0631 \u06a9\u0648 \u0627\u0646\u0633\u0679\u0627\u0644 \u0627\u0648\u0631 \u0686\u0644\u0627\u062a\u0627 \u06c1\u06d2\u06d4 <code>app.py<\/code><\/p>\n<\/li>\n<li>\n<p>\u0622\u067e \u06a9\u0627 \u0645\u0627\u0688\u0644 \u06cc\u06c1\u0627\u06ba \u0634\u0627\u0626\u0639 \u06c1\u0648\u0627 \u06c1\u06d2: <code>https:\/\/huggingface.co\/spaces\/USERNAME\/urdu-llm-chat<\/code><\/p>\n<\/li>\n<\/ol>\n<p><strong>\u06a9\u06cc\u0648\u06ba CPUs \u0627\u0686\u06be\u06d2 \u06c1\u06cc\u06ba:<\/strong> \u06c1\u0645\u0627\u0631\u06d2 \u0645\u0627\u0688\u0644 \u0645\u06cc\u06ba \u0635\u0631\u0641 23M \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632 (~90MB) \u06c1\u06cc\u06ba\u06d4 CPU \u067e\u0631 \u0627\u0646\u062f\u0627\u0632\u06c1 \u0644\u06af\u0627\u0646\u06d2 \u0645\u06cc\u06ba 1 \u0633\u06cc\u06a9\u0646\u0688 \u0633\u06d2 \u0628\u06be\u06cc \u06a9\u0645 \u0648\u0642\u062a \u0644\u06af\u062a\u0627 \u06c1\u06d2\u06d4 \u0688\u06cc\u0644\u06cc\u0648\u0631\u06cc \u06a9\u06d2 \u0644\u06cc\u06d2 GPU \u06a9\u06cc \u0636\u0631\u0648\u0631\u062a \u0646\u06c1\u06cc\u06ba \u06c1\u06d2\u06d4<\/p>\n<h4 id=\"heading-option-b-running-locally\">\u0622\u067e\u0634\u0646 B: \u0645\u0642\u0627\u0645\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0686\u0644\u0627\u0626\u06cc\u06ba\u06d4<\/h4>\n<pre><code class=\"language-bash\">cd your-project-directory\npython -m venv .venv\nsource .venv\/bin\/activate\npip install -r requirements.txt\npython app.py\n<\/code><\/pre>\n<p>\u06cc\u06c1 \u0627\u06af\u0644\u06cc \u0628\u0627\u0631 \u06a9\u06be\u0644\u062a\u0627 \u06c1\u06d2\u06d4 <code>http:\/\/127.0.0.1:7860<\/code>. 
\u06cc\u06c1 Python 3.9 \u06cc\u0627 \u0627\u0633 \u0633\u06d2 \u0627\u0648\u067e\u0631 \u0648\u0627\u0644\u06d2 \u06a9\u0633\u06cc \u0628\u06be\u06cc \u06a9\u0645\u067e\u06cc\u0648\u0679\u0631 \u067e\u0631 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2\u06d4<\/p>\n<h4 id=\"heading-option-c-terminal-chat-no-ui\">\u0627\u062e\u062a\u06cc\u0627\u0631 C: \u0679\u0631\u0645\u06cc\u0646\u0644 \u0686\u06cc\u0679 (\u06a9\u0648\u0626\u06cc UI \u0646\u06c1\u06cc\u06ba)<\/h4>\n<p>\u0627\u06cc\u06a9 \u06c1\u0644\u06a9\u0627 \u067e\u06be\u0644\u06a9\u0627 \u0645\u062a\u0628\u0627\u062f\u0644 \u062c\u0633 \u0645\u06cc\u06ba \u06a9\u0648\u0626\u06cc \u06af\u0631\u06cc\u0688\u06cc\u0648 \u0627\u0646\u062d\u0635\u0627\u0631 \u0646\u06c1\u06cc\u06ba\u060c \u0635\u0631\u0641 \u0679\u0631\u0645\u06cc\u0646\u0644 \u0627\u0646 \u067e\u0679\/\u0622\u0624\u0679 \u067e\u0679\u06d4 \u0645\u0627\u0688\u0644 \u0644\u0648\u0688 \u06a9\u0631\u06cc\u06ba \u0627\u0648\u0631 \u0627\u06cc\u06a9 \u0627\u0646\u0679\u0631\u0627\u06cc\u06a9\u0679\u0648 \u0644\u0648\u067e \u062f\u0627\u062e\u0644 \u06a9\u0631\u06cc\u06ba\u06d4<\/p>\n<pre><code class=\"language-python\">\"\"\"\nStandalone Chat Inference Script for Urdu LLM\n\nUsage:\n    python inference\/chat.py\n\"\"\"\n\nimport sys\nimport torch\nfrom pathlib import Path\nfrom tokenizers import Tokenizer\n\n# Add project root to path\nPROJECT_ROOT = Path(__file__).resolve().parent.parent\nsys.path.insert(0, str(PROJECT_ROOT))\n\nfrom model.config import UrduLLMConfig\nfrom model.transformer import UrduGPT\n\n\ndef load_model(checkpoint_path: str, device: str = None):\n    \"\"\"Load the fine-tuned model.\"\"\"\n    if device is None:\n        if torch.cuda.is_available():\n            device = \"cuda\"\n        elif torch.backends.mps.is_available():\n            device = \"mps\"\n        else:\n            device = \"cpu\"\n\n    device = torch.device(device)\n\n    config = UrduLLMConfig()\n    model = UrduGPT(config).to(device)\n\n    
checkpoint = torch.load(checkpoint_path, map_location=device)\n    model.load_state_dict(checkpoint['model_state_dict'])\n    model.eval()\n\n    return model, config, device\n\n\ndef chat_response(model, tokenizer, config, device, user_message,\n                  system_prompt=\"\u0622\u067e \u0627\u06cc\u06a9 \u0645\u062f\u062f\u06af\u0627\u0631 \u0627\u0631\u062f\u0648 \u0627\u0633\u0633\u0679\u0646\u0679 \u06c1\u06cc\u06ba\u06d4\",\n                  max_tokens=100, temperature=0.7):\n    \"\"\"Generate a chat response.\"\"\"\n    BOS_ID = tokenizer.token_to_id(\"&lt;bos&gt;\")\n    EOS_ID = tokenizer.token_to_id(\"&lt;eos&gt;\")\n    SEP_ID = tokenizer.token_to_id(\"&lt;sep&gt;\")\n    USER_ID = tokenizer.token_to_id(\"&lt;|user|&gt;\")\n    ASSISTANT_ID = tokenizer.token_to_id(\"&lt;|assistant|&gt;\")\n    SYSTEM_ID = tokenizer.token_to_id(\"&lt;|system|&gt;\")\n\n    # Build prompt\n    prompt_ids = [BOS_ID, SYSTEM_ID]\n\n    sys_ids = tokenizer.encode(system_prompt).ids\n    if sys_ids and sys_ids[0] == BOS_ID: sys_ids = sys_ids[1:]\n    if sys_ids and sys_ids[-1] == EOS_ID: sys_ids = sys_ids[:-1]\n    prompt_ids.extend(sys_ids)\n    prompt_ids.append(SEP_ID)\n\n    prompt_ids.append(USER_ID)\n    user_ids = tokenizer.encode(user_message).ids\n    if user_ids and user_ids[0] == BOS_ID: user_ids = user_ids[1:]\n    if user_ids and user_ids[-1] == EOS_ID: user_ids = user_ids[:-1]\n    prompt_ids.extend(user_ids)\n    prompt_ids.append(SEP_ID)\n\n    prompt_ids.append(ASSISTANT_ID)\n\n    # Generate\n    input_tensor = torch.tensor([prompt_ids], dtype=torch.long).to(device)\n    output_ids = model.generate(\n        input_tensor,\n        max_new_tokens=max_tokens,\n        temperature=temperature,\n        top_k=50,\n        top_p=0.9,\n        eos_token_id=EOS_ID,\n    )\n\n    generated_ids = output_ids[0][len(prompt_ids):].tolist()\n    if EOS_ID in generated_ids:\n        generated_ids = generated_ids[:generated_ids.index(EOS_ID)]\n\n    return tokenizer.decode(generated_ids)\n\n\ndef 
main():\n    print(\"=\" * 60)\n    print(\"&#x1f1f5;&#x1f1f0;  \u0627\u0631\u062f\u0648 LLM \u0686\u06cc\u0679 \u0628\u0648\u0679  &#x1f1f5;&#x1f1f0;\")\n    print(\"    Urdu LLM ChatBot\")\n    print(\"=\" * 60)\n\n    # Load model\n    tokenizer_path = PROJECT_ROOT \/ \"tokenizer\" \/ \"urdu_tokenizer\" \/ \"urdu_bpe_tokenizer.json\"\n\n    # Try SFT model first, fall back to pre-trained\n    sft_path = PROJECT_ROOT \/ \"model\" \/ \"checkpoints\" \/ \"sft_model.pt\"\n    pretrained_path = PROJECT_ROOT \/ \"model\" \/ \"checkpoints\" \/ \"best_model.pt\"\n\n    if sft_path.exists():\n        checkpoint_path = sft_path\n        print(\"Loading SFT (conversational) model...\")\n    elif pretrained_path.exists():\n        checkpoint_path = pretrained_path\n        print(\"Loading pre-trained model (not fine-tuned for chat)...\")\n    else:\n        print(\"&#x274c; No model checkpoint found!\")\n        print(\"   Run notebooks 03 and 04 first to train the model.\")\n        sys.exit(1)\n\n    model, config, device = load_model(str(checkpoint_path))\n    tokenizer = Tokenizer.from_file(str(tokenizer_path))\n\n    print(f\"Model loaded on {device}\")\n    print(\"\\nType your message in Urdu. Type 'quit' to exit.\\n\")\n    print(\"-\" * 60)\n\n    while True:\n        try:\n            user_input = input(\"\\n&#x1f464; \u0622\u067e: \").strip()\n        except (EOFError, KeyboardInterrupt):\n            print(\"\\n\u062e\u062f\u0627 \u062d\u0627\u0641\u0638! &#x1f44b;\")\n            break\n\n        if user_input.lower() in ['quit', 'exit', 'q']:\n            print(\"\u062e\u062f\u0627 \u062d\u0627\u0641\u0638! 
&#x1f44b;\")\n            break\n\n        if not user_input:\n            continue\n\n        response = chat_response(model, tokenizer, config, device, user_input)\n        print(f\"&#x1f916; \u0628\u0648\u0679: {response}\")\n\n\nif __name__ == \"__main__\":\n    main()\n<\/sep><\/eos><\/bos><\/code><\/pre>\n<p>\u0627\u0633\u06d2 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0686\u0644\u0627\u0626\u06cc\u06ba:<\/p>\n<pre><code class=\"language-bash\">python inference\/chat.py\n<\/code><\/pre>\n<pre><code class=\"language-plaintext\">&#x1f464; \u0622\u067e: \u0627\u0644\u0633\u0644\u0627\u0645 \u0639\u0644\u06cc\u06a9\u0645\n&#x1f916; \u0628\u0648\u0679: \u0648\u0639\u0644\u06cc\u06a9\u0645 \u0627\u0644\u0633\u0644\u0627\u0645! \u0645\u06cc\u06ba \u0622\u067e \u06a9\u06cc \u06a9\u06cc\u0627 \u0645\u062f\u062f \u06a9\u0631 \u0633\u06a9\u062a\u0627 \u06c1\u0648\u06ba\u061f\n<\/code><\/pre>\n<h2 id=\"heading-full-pipeline-summary\">\u0645\u06a9\u0645\u0644 \u067e\u0627\u0626\u067e \u0644\u0627\u0626\u0646 \u06a9\u0627 \u062e\u0644\u0627\u0635\u06c1<\/h2>\n<table>\n<thead>\n<tr>\n<th>\u0642\u062f\u0645<\/th>\n<th>\u0648\u0636\u0627\u062d\u062a<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td><strong>\u0645\u0631\u062d\u0644\u06c1 1<\/strong><\/td>\n<td>\u062e\u0627\u0645 \u0627\u0631\u062f\u0648 \u0645\u062a\u0646 \u2192 \u0635\u0627\u0641 \u06a9\u0627\u0631\u067e\u0633<\/td>\n<\/tr>\n<tr>\n<td><strong>\u0645\u0631\u062d\u0644\u06c1 2<\/strong><\/td>\n<td>\u06a9\u0627\u0631\u067e\u0633 \u2192 BPE \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0631 (32K \u0630\u062e\u06cc\u0631\u06c1 \u0627\u0644\u0641\u0627\u0638)<\/td>\n<\/tr>\n<tr>\n<td><strong>\u0645\u0631\u062d\u0644\u06c1 3<\/strong><\/td>\n<td>\u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u0688 \u062d\u0635\u06c1 \u2192 GPT \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af (23M \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632\u060c \u0627\u06af\u0644\u06cc 
\u0679\u0648\u06a9\u0646 \u067e\u06cc\u0634\u0646 \u06af\u0648\u0626\u06cc)<\/td>\n<\/tr>\n<tr>\n<td><strong>\u0645\u0631\u062d\u0644\u06c1 4<\/strong><\/td>\n<td>\u0628\u0627\u062a \u0686\u06cc\u062a \u2192 \u0646\u0642\u0635\u0627\u0646 \u06a9\u06cc \u0645\u0627\u0633\u06a9\u0646\u06af \u06a9\u06d2 \u0633\u0627\u062a\u06be SFT \u2192 \u0686\u06cc\u0679 \u0645\u0627\u0688\u0644<\/td>\n<\/tr>\n<tr>\n<td><strong>\u062a\u0639\u06cc\u0646\u0627\u062a\u06cc<\/strong><\/td>\n<td><code>app.py<\/code>    + \u0645\u0627\u0688\u0644 \u0648\u0632\u0646 \u2192 Hugging Face Spaces\/\u0645\u0642\u0627\u0645\u06cc<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u0622\u067e \u062a\u0639\u06cc\u0646\u0627\u062a \u06a9\u0631\u062f\u06c1 \u0645\u0627\u0688\u0644 \u06a9\u0648 \u06cc\u06c1\u0627\u06ba \u062f\u06cc\u06a9\u06be \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba: https:\/\/huggingface.co\/spaces\/Wisamul\/urdu_llm<\/p>\n<h2 id=\"heading-results\">\u0646\u062a\u0627\u0626\u062c<\/h2>\n<p>\u0646\u062a\u0627\u0626\u062c \u06a9\u0627\u0641\u06cc \u062f\u0644\u0686\u0633\u067e \u062a\u06be\u06d2\u06d4 \u0686\u0648\u0646\u06a9\u06c1 \u06c1\u0645\u0627\u0631\u06d2 \u067e\u0627\u0633 \u0627\u06cc\u06a9 \u0686\u06be\u0648\u0679\u0627 \u0633\u0627 \u062a\u0631\u0628\u06cc\u062a\u06cc \u0646\u0645\u0648\u0646\u06c1 \u062a\u06be\u0627\u060c \u0627\u0633 \u0644\u06cc\u06d2 \u06c1\u06cc\u0644\u0648\u0633\u06cc\u0646\u06cc\u0634\u0646 (hallucination) \u06a9\u0627 \u0627\u0645\u06a9\u0627\u0646 \u06a9\u0627\u0641\u06cc \u0632\u06cc\u0627\u062f\u06c1 \u062a\u06be\u0627\u060c \u0627\u0648\u0631 \u0622\u067e \u06c1\u06cc\u0644\u0648\u0633\u06cc\u0646\u06cc\u0634\u0646 \u06a9\u06d2 \u062a\u0641\u0631\u06cc\u062d\u06cc \u062d\u0635\u06d2 \u06a9\u06cc \u062a\u0639\u0631\u06cc\u0641 \u06a9\u0631\u06cc\u06ba \u06af\u06d2\u060c \u062e\u0627\u0635 \u0637\u0648\u0631 \u067e\u0631 \u0627\u06af\u0631 \u0622\u067e \u0627\u0631\u062f\u0648 \u0633\u0645\u062c\u06be \u0633\u06a9\u062a\u06d2 
\u06c1\u0648\u06ba\u06d4 \u0644\u06cc\u06a9\u0646 \u0645\u062c\u0645\u0648\u0639\u06cc \u0637\u0648\u0631 \u067e\u0631 \u0627\u0633 \u0646\u06d2 \u0627\u0686\u06be\u0627 \u06a9\u0627\u0645 \u06a9\u06cc\u0627\u06d4<\/p>\n<p>\u06cc\u06c1\u0627\u06ba \u062a\u0639\u06cc\u0646\u0627\u062a \u0645\u0627\u0688\u0644 \u06a9\u06d2 \u0633\u0627\u062a\u06be \u06a9\u0686\u06be \u0646\u0645\u0648\u0646\u06d2 \u06a9\u06d2 \u062a\u0639\u0627\u0645\u0644\u0627\u062a \u06c1\u06cc\u06ba:<\/p>\n<pre><code class=\"language-plaintext\">&#x1f464; User: \u0627\u0644\u0633\u0644\u0627\u0645 \u0639\u0644\u06cc\u06a9\u0645\n&#x1f916; Assistant: \u0648\u0639\u0644\u06cc\u06a9\u0645 \u0627\u0644\u0633\u0644\u0627\u0645! \u0645\u06cc\u06ba \u0622\u067e \u06a9\u06cc \u06a9\u06cc\u0627 \u0645\u062f\u062f \u06a9\u0631 \u0633\u06a9\u062a\u0627 \u06c1\u0648\u06ba\u061f\n\n&#x1f464; User: \u067e\u0627\u06a9\u0633\u062a\u0627\u0646 \u06a9\u0627 \u062f\u0627\u0631\u0627\u0644\u062d\u06a9\u0648\u0645\u062a \u06a9\u06cc\u0627 \u06c1\u06d2\u061f\n&#x1f916; Assistant: \u067e\u0627\u06a9\u0633\u062a\u0627\u0646 \u06a9\u0627 \u062f\u0627\u0631\u0627\u0644\u062d\u06a9\u0648\u0645\u062a \u0627\u0633\u0644\u0627\u0645 \u0622\u0628\u0627\u062f \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0634\u06c1\u0631 1960 \u06a9\u06cc \u062f\u06c1\u0627\u0626\u06cc \u0645\u06cc\u06ba \u062a\u0639\u0645\u06cc\u0631 \u06a9\u06cc\u0627 \u06af\u06cc\u0627 \u062a\u06be\u0627\u06d4\n\n&#x1f464; User: \u062a\u0639\u0644\u06cc\u0645 \u06a9\u06cc \u0627\u06c1\u0645\u06cc\u062a \u06a9\u06cc\u0627 \u06c1\u06d2\u061f\n&#x1f916; Assistant: \u062a\u0639\u0644\u06cc\u0645 \u0627\u0646\u0633\u0627\u0646\u06cc \u062a\u0631\u0642\u06cc \u06a9\u06cc \u0628\u0646\u06cc\u0627\u062f \u06c1\u06d2\u06d4 \u06cc\u06c1 \u0646\u06c1 \u0635\u0631\u0641 \u0639\u0644\u0645 \u062d\u0627\u0635\u0644 \u06a9\u0631\u0646\u06d2 \u06a9\u0627 \u0630\u0631\u06cc\u0639\u06c1 \u06c1\u06d2 \u0628\u0644\u06a9\u06c1 \u0645\u0639\u0627\u0634\u0631\u06d2 \u06a9\u06cc 
\u062a\u0639\u0645\u06cc\u0631 \u0645\u06cc\u06ba \u0627\u06c1\u0645 \u06a9\u0631\u062f\u0627\u0631 \u0627\u062f\u0627 \u06a9\u0631\u062a\u06cc \u06c1\u06d2\u06d4\n<\/code><\/pre>\n<p>\u0627\u06cc\u0633\u06d2 \u0633\u0648\u0627\u0644\u0627\u062a \u06a9\u06d2 \u0644\u06cc\u06d2 \u062c\u0648 \u062a\u0631\u0628\u06cc\u062a\u06cc \u0688\u06cc\u0679\u0627 \u06a9\u06d2 \u0642\u0631\u06cc\u0628 \u06c1\u06cc\u06ba\u060c \u0645\u0627\u0688\u0644 \u062f\u0631\u0633\u062a \u0627\u0648\u0631 \u0631\u0648\u0627\u0646\u06cc \u0633\u06d2 \u062c\u0648\u0627\u0628 \u062f\u06cc\u062a\u0627 \u06c1\u06d2\u06d4 \u062a\u0642\u0633\u06cc\u0645 \u0633\u06d2 \u0628\u0627\u06c1\u0631 \u06a9\u06d2 \u0633\u0648\u0627\u0644\u0627\u062a \u06a9\u06d2 \u0644\u06cc\u06d2\u060c \u0645\u0627\u0688\u0644 \u06c1\u06cc\u0644\u0648\u0633\u06cc\u0646\u06cc\u0634\u0646 (hallucination) \u06cc\u0627 \u0679\u06a9\u0691\u0648\u06ba \u06a9\u0648 \u062f\u06c1\u0631\u0627\u0646\u06d2 \u06a9\u0627 \u0631\u062c\u062d\u0627\u0646 \u0631\u06a9\u06be\u062a\u0627 \u06c1\u06d2\u06d4 \u0686\u06be\u0648\u0679\u06d2 SFT \u0688\u06cc\u0679\u0627\u0633\u06cc\u0679 (79 \u0645\u062b\u0627\u0644\u06cc\u06ba) \u0627\u0648\u0631 \u0645\u0627\u0688\u0644 \u0633\u0627\u0626\u0632 (23M \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632) \u06a9\u06d2 \u067e\u06cc\u0634 \u0646\u0638\u0631 \u06cc\u06c1 \u0645\u062a\u0648\u0642\u0639 \u06c1\u06d2\u06d4<\/p>\n<h2 id=\"heading-conclusion\">\u0646\u062a\u06cc\u062c\u06c1<\/h2>\n<p>\u0627\u0633 \u0633\u0628 \u06a9\u0627 \u0645\u0642\u0635\u062f \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u0628\u0646\u0627\u0646\u06d2 \u06a9\u06d2 \u0639\u0645\u0644 \u0645\u06cc\u06ba \u0634\u0627\u0645\u0644 \u062a\u0645\u0627\u0645 \u0627\u0642\u062f\u0627\u0645\u0627\u062a \u067e\u0631 \u0628\u062d\u062b \u0627\u0648\u0631 \u0648\u0636\u0627\u062d\u062a \u06a9\u0631\u0646\u0627 \u062a\u06be\u0627\u06d4 \u0644\u0627\u06af\u062a\u060c \u062a\u0631\u0628\u06cc\u062a\u060c \u0688\u06cc\u0679\u0627 
\u0627\u06a9\u0679\u06be\u0627 \u06a9\u0631\u0646\u06d2 \u0648\u063a\u06cc\u0631\u06c1 \u062c\u06cc\u0633\u06d2 \u0639\u0648\u0627\u0645\u0644 \u06a9\u06d2 \u0633\u0627\u062a\u06be\u060c \u06c1\u0645 \u0627\u0645\u06cc\u062f \u06a9\u0631\u062a\u06d2 \u06c1\u06cc\u06ba \u06a9\u06c1 \u0627\u0633 \u0633\u06d2 \u06cc\u06c1 \u0648\u0627\u0636\u062d \u06c1\u0648 \u062c\u0627\u0626\u06d2 \u06af\u0627 \u06a9\u06c1 \u06c1\u0631 \u06a9\u0648\u0626\u06cc \u0627\u067e\u0646\u0627 LLM \u06a9\u06cc\u0648\u06ba \u0646\u06c1\u06cc\u06ba \u0628\u0646\u0627\u062a\u0627\u06d4 \u0633\u0627\u062a\u06be \u06c1\u06cc\u060c \u0645\u062c\u06be\u06d2 \u0627\u0645\u06cc\u062f \u06c1\u06d2 \u06a9\u06c1 \u0627\u0633 \u0639\u0645\u0644 \u0633\u06d2 \u06af\u0632\u0631\u0646\u06d2 \u0633\u06d2 \u0622\u067e \u06a9\u0648 \u06cc\u06c1 \u0633\u0645\u062c\u06be\u0646\u06d2 \u0645\u06cc\u06ba \u0645\u062f\u062f \u0645\u0644\u06d2 \u06af\u06cc \u06a9\u06c1 \u0622\u067e \u0646\u06d2 \u067e\u06c1\u0644\u06d2 \u06a9\u06cc\u0627 \u06a9\u06cc\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u0622\u067e \u0646\u06d2 \u06cc\u06c1\u0627\u06ba \u06a9\u06cc\u0627 \u062d\u0627\u0635\u0644 \u06a9\u06cc\u0627 \u06c1\u06d2\u06d4<\/p>\n<p>\u06c1\u0645 \u062e\u0627\u0645 \u0627\u0631\u062f\u0648 \u0645\u062a\u0646 \u0633\u06d2 \u0627\u06cc\u06a9 \u062a\u0639\u06cc\u0646\u0627\u062a \u0686\u06cc\u0679 \u0628\u0648\u0679 \u067e\u0631 \u0686\u0644\u06d2 \u06af\u0626\u06d2\u06d4 \u0688\u06cc\u0679\u0627 \u06a9\u06cc \u0635\u0641\u0627\u0626\u06cc\u060c BPE \u0679\u0648\u06a9\u0646\u0627\u0626\u0632\u06cc\u0634\u0646\u060c GPT \u0627\u0633\u0679\u0627\u0626\u0644 \u0679\u0631\u0627\u0646\u0633\u0641\u0627\u0631\u0645\u0631 \u067e\u0631\u06cc \u0679\u0631\u06cc\u0646\u0646\u06af\u060c \u0646\u0642\u0635\u0627\u0646 \u06a9\u06cc \u0645\u0627\u0633\u06a9\u0646\u06af \u06a9\u06d2 \u0633\u0627\u062a\u06be \u0641\u0627\u0626\u0646 \u0679\u06cc\u0648\u0646\u0646\u06af 
\u0646\u06af\u0631\u0627\u0646\u06cc\u060c \u0627\u0648\u0631 \u0622\u062e\u0631 \u0645\u06cc\u06ba \u06af\u0631\u06cc\u0688\u06cc\u0648 \u0648\u06cc\u0628 \u0627\u0646\u0679\u0631\u0641\u06cc\u0633 \u062a\u06be\u0627\u06d4<\/p>\n<p>\u0645\u0627\u0688\u0644 \u0686\u06be\u0648\u0679\u0627 \u06c1\u06d2 \u0627\u0648\u0631 \u0688\u06cc\u0679\u0627 \u0633\u06cc\u0679 \u0628\u06be\u06cc \u0686\u06be\u0648\u0679\u0627 \u06c1\u06d2\u060c \u0644\u06cc\u06a9\u0646 \u06cc\u06c1\u0627\u06ba \u06a9\u06d2 \u062a\u0645\u0627\u0645 \u062a\u0635\u0648\u0631\u0627\u062a (\u062a\u0648\u062c\u06c1\u060c \u0627\u06af\u0644\u06cc \u0679\u0648\u06a9\u0646 \u067e\u06cc\u0634\u0646 \u06af\u0648\u0626\u06cc\u060c SFT\u060c \u0686\u06cc\u0679 \u0641\u0627\u0631\u0645\u06cc\u0679) \u0648\u06c1\u06cc \u06c1\u06cc\u06ba \u062c\u0648 GPT-4 \u0627\u0648\u0631 Llama \u062c\u06cc\u0633\u06d2 \u067e\u0631\u0648\u0688\u06a9\u0634\u0646 LLMs \u0645\u06cc\u06ba \u06a9\u06c1\u06cc\u06ba \u0628\u0691\u06d2 \u067e\u06cc\u0645\u0627\u0646\u06d2 \u067e\u0631 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06c1\u0648\u062a\u06d2 \u06c1\u06cc\u06ba\u06d4<\/p>\n<p>\u0627\u06af\u0631 \u0622\u067e \u0627\u0633\u06d2 \u0628\u06c1\u062a\u0631 \u0628\u0646\u0627\u0646\u0627 \u0686\u0627\u06c1\u062a\u06d2 \u06c1\u06cc\u06ba\u060c \u062a\u0648 \u0627\u06af\u0644\u06d2 \u0627\u0642\u062f\u0627\u0645\u0627\u062a \u062c\u0646 \u067e\u0631 \u0633\u0628 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u0627\u062b\u0631 \u067e\u0691\u06d2 \u06af\u0627 \u0648\u06c1 \u06c1\u06cc\u06ba:<\/p>\n<ol>\n<li>\n<p>\u0645\u0632\u06cc\u062f SFT \u0688\u06cc\u0679\u0627 (79 \u06a9\u06cc \u0628\u062c\u0627\u0626\u06d2 \u06c1\u0632\u0627\u0631\u0648\u06ba \u0645\u062b\u0627\u0644\u06cc\u06ba)\u06d4<\/p>\n<\/li>\n<li>\n<p>\u0628\u0691\u06d2 \u0645\u0627\u0688\u0644 (100 \u0645\u0644\u06cc\u0646 \u0633\u06d2 \u0632\u06cc\u0627\u062f\u06c1 \u067e\u06cc\u0631\u0627\u0645\u06cc\u0679\u0631\u0632)\u06d4<\/p>\n<\/li>\n<li>\n<p>RLHF\/DPO 
\u0627\u0644\u0627\u0626\u0646\u0645\u0646\u0679\u06d4<\/p>\n<\/li>\n<\/ol>\n<p>\u0644\u06cc\u06a9\u0646 \u0627\u0633 \u067e\u06cc\u0645\u0627\u0646\u06d2 \u067e\u0631 \u0628\u06be\u06cc\u060c \u0627\u0628 \u06c1\u0645\u0627\u0631\u06d2 \u067e\u0627\u0633 \u067e\u0648\u0631\u06cc \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u067e\u0627\u0626\u067e \u0644\u0627\u0626\u0646 \u06a9\u06cc \u0679\u06be\u0648\u0633 \u0633\u0645\u062c\u06be \u06c1\u06d2\u06d4<\/p>\n<\/div>\n","protected":false},"excerpt":{"rendered":"<p>\u06a9\u06cc\u0627 \u06c1\u0648\u06af\u0627 \u0627\u06af\u0631 \u0622\u067e \u0627\u067e\u0646\u06cc \u0645\u0627\u062f\u0631\u06cc \u0632\u0628\u0627\u0646 \u06a9\u0627 \u0627\u0633\u062a\u0639\u0645\u0627\u0644 \u06a9\u0631\u062a\u06d2 \u06c1\u0648\u0626\u06d2 \u0634\u0631\u0648\u0639 \u0633\u06d2 \u0627\u067e\u0646\u0627 LLM \u0628\u0646\u0627 \u0633\u06a9\u062a\u06d2 \u06c1\u06cc\u06ba\u061f \u0628\u0627\u0644\u06a9\u0644 \u06cc\u06c1\u06cc \u06c1\u06d2 \u062c\u0648 \u06c1\u0645 \u0627\u0633 \u0679\u06cc\u0648\u0679\u0648\u0631\u06cc\u0644 \u0645\u06cc\u06ba \u06a9\u0631\u06cc\u06ba \u06af\u06d2\u06d4 \u06cc\u06c1 \u0633\u0645\u062c\u06be\u0646\u06d2 \u06a9\u0627 \u0628\u06c1\u062a\u0631\u06cc\u0646 \u0637\u0631\u06cc\u0642\u06c1 \u06c1\u06d2 \u06a9\u06c1 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 \u06a9\u06cc\u0633\u06d2 \u06a9\u0627\u0645 \u06a9\u0631\u062a\u0627 \u06c1\u06d2 \u062f\u0631\u0627\u0635\u0644 \u0627\u0633\u06d2 \u0628\u0646\u0627\u0646\u0627 \u06c1\u06d2\u06d4 \u0622\u0626\u06cc\u06d2 \u0627\u06cc\u06a9 \u0645\u062e\u0635\u0648\u0635 \u0632\u0628\u0627\u0646 \u0645\u06cc\u06ba \u0627\u067e\u0646\u0627 \u0627\u06cc\u0644 \u0627\u06cc\u0644 \u0627\u06cc\u0645 
[&hellip;]<\/p>\n","protected":false},"author":7,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"site-sidebar-layout":"default","site-content-layout":"","ast-site-content-layout":"default","site-content-style":"default","site-sidebar-style":"default","ast-global-header-display":"","ast-banner-title-visibility":"","ast-main-header-display":"","ast-hfb-above-header-display":"","ast-hfb-below-header-display":"","ast-hfb-mobile-header-display":"","site-post-title":"","ast-breadcrumbs-content":"","ast-featured-img":"","footer-sml-layout":"","ast-disable-related-posts":"","theme-transparent-header-meta":"","adv-header-id-meta":"","stick-header-meta":"","header-above-stick-meta":"","header-main-stick-meta":"","header-below-stick-meta":"","astra-migrate-meta-layouts":"default","ast-page-background-enabled":"default","ast-page-background-meta":{"desktop":{"background-color":"var(--ast-global-color-5)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"ast-content-background-meta":{"desktop":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"tablet":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""},"mobile":{"background-color":"var(--ast-global-color-4)","background-image":"","background-repeat":"repeat","background-position":"center 
center","background-size":"auto","background-attachment":"scroll","background-type":"","background-media":"","overlay-type":"","overlay-color":"","overlay-opacity":"","overlay-gradient":""}},"footnotes":""},"categories":[1],"tags":[],"class_list":["post-23027","post","type-post","status-publish","format-standard","hentry","category-blog"],"_links":{"self":[{"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/posts\/23027","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/users\/7"}],"replies":[{"embeddable":true,"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/comments?post=23027"}],"version-history":[{"count":0,"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/posts\/23027\/revisions"}],"wp:attachment":[{"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/media?parent=23027"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/categories?post=23027"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/umang.pk\/en_us\/wp-json\/wp\/v2\/tags?post=23027"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}