From 9fdf52ecf73d07cdb4d09ce74a9b43df874324d0 Mon Sep 17 00:00:00 2001 From: romanalakomcikova Date: Fri, 10 Nov 2023 17:43:19 +0100 Subject: [PATCH] Improve Thai Chunk Parser tutorial --- source/notebooks/pythainlp_chunk.ipynb | 543 ++++++++++++++----------- 1 file changed, 298 insertions(+), 245 deletions(-) diff --git a/source/notebooks/pythainlp_chunk.ipynb b/source/notebooks/pythainlp_chunk.ipynb index 9ea47f9..931095b 100644 --- a/source/notebooks/pythainlp_chunk.ipynb +++ b/source/notebooks/pythainlp_chunk.ipynb @@ -1,275 +1,328 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "eCfShB9fUSqO" + }, + "source": [ + "# Thai Chunk Parser\n", + "\n", + "This tutorial demonstrates how to use the `chunk_parse` function from the PyThaiNLP library for parsing Thai text into phrases. We will use a chunking model trained on ORCHID++ corpus. \n", + "\n", + "Read more: https://github.com/PyThaiNLP/pythainlp/pull/524" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will need the following libraries and packages: \n", + "- PyThaiNLP\n", + "- NLTK (to preprocess chunk data for visualization)\n", + "- svgling (for visualization)\n", + "- python-crfsuite" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "id": "JvwrS6MDhitW", + "outputId": "ab197d92-b537-4974-e1b5-6bdaa7b8cefd" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pythainlp in /usr/local/lib/python3.10/dist-packages (4.0.2)\n", + "Requirement already satisfied: svgling in /usr/local/lib/python3.10/dist-packages (0.3.1)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n", + "Collecting python-crfsuite\n", + " Downloading 
python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m993.5/993.5 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.10/dist-packages (from pythainlp) (2.31.0)\n", + "Requirement already satisfied: svgwrite in /usr/local/lib/python3.10/dist-packages (from svgling) (1.4.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.6)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.3.2)\n", + "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.6.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (2023.7.22)\n", + "Installing collected packages: python-crfsuite\n", + "Successfully installed python-crfsuite-0.9.9\n" + ] } + ], + "source": [ + "!pip install pythainlp svgling nltk python-crfsuite" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "eCfShB9fUSqO" - }, - "source": [ - "# Thai Chunk Parser\n", - "\n", - "In PyThaiNLP, We use chunk data from ORCHID++ corpus.\n", - "\n", - "Read more: https://github.com/PyThaiNLP/pythainlp/pull/524" - ] - }, - 
{ - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "JvwrS6MDhitW", - "outputId": "ab197d92-b537-4974-e1b5-6bdaa7b8cefd" - }, - "source": [ - "!pip install pythainlp svgling nltk python-crfsuite" - ], - "execution_count": 5, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Requirement already satisfied: pythainlp in /usr/local/lib/python3.10/dist-packages (4.0.2)\n", - "Requirement already satisfied: svgling in /usr/local/lib/python3.10/dist-packages (0.3.1)\n", - "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n", - "Collecting python-crfsuite\n", - " Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m993.5/993.5 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: requests>=2.22.0 in /usr/local/lib/python3.10/dist-packages (from pythainlp) (2.31.0)\n", - "Requirement already satisfied: svgwrite in /usr/local/lib/python3.10/dist-packages (from svgling) (1.4.3)\n", - "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.6)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.3.2)\n", - "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.6.3)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (3.2.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (3.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in 
/usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (2.0.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.22.0->pythainlp) (2023.7.22)\n", - "Installing collected packages: python-crfsuite\n", - "Successfully installed python-crfsuite-0.9.9\n" - ] - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZPRRhKxrhlFA" - }, - "source": [ - "from pythainlp.tokenize import word_tokenize\n", - "from pythainlp.tag import pos_tag\n", - "from pythainlp.tag import chunk_parse\n", - "from nltk.chunk import conlltags2tree\n", - "import svgling" - ], - "execution_count": 1, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "bGD2uxMFhmh4" - }, - "source": [ - "def test(txt):\n", - " m = [(w,t) for w,t in pos_tag(word_tokenize(txt), engine= 'perceptron',corpus = 'orchid')]\n", - " tag = chunk_parse(m)\n", - " p = [(w,t,tag[i]) for i,(w,t) in enumerate(m)]\n", - " return p" - ], - "execution_count": 2, - "outputs": [] + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to import the following modules and functions:\n", + "- `word_tokenize` – this function takes Thai text and returns a list of tokenized words\n", + "- `pos_tag` – this function takes a list of tokenized words and marks them with part-of-speech (POS) tags\n", + "- `chunk_parse` – this function takes words with their POS tags and marks them with inside-outside-beginning (IOB) tags\n", + "- `conlltags2tree` – this function is part of NLTK and converts IOB-tagged tokens to a tree\n", + "- `svgling` – this package will be used to visualize the tree in SVG\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "ZPRRhKxrhlFA" + }, + "outputs": [], + "source": [ + "from pythainlp.tokenize import word_tokenize\n", + "from pythainlp.tag import pos_tag\n", + "from pythainlp.tag import chunk_parse\n", + "from nltk.chunk import conlltags2tree\n", +
"import svgling" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define a new function `test`, which will first segment the input text into words (`word_tokenize`), tag the words with their parts of speech using the perceptron tagger trained on the ORCHID corpus (`pos_tag`), and perform chunking (`chunk_parse`). The function then combines the words, POS tags, and IOB tags into a list of triples `p`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "bGD2uxMFhmh4" + }, + "outputs": [], + "source": [ + "def test(txt):\n", + " m = [(w,t) for w,t in pos_tag(word_tokenize(txt), engine= 'perceptron',corpus = 'orchid')]\n", + " tag = chunk_parse(m)\n", + " p = [(w,t,tag[i]) for i,(w,t) in enumerate(m)]\n", + " return p" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we call the `test` function to chunk several example sentences. We then use the `svgling.draw_tree` function to visualize the syntactic trees, which were generated from the chunked data by the `conlltags2tree` function."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 188 }, + "id": "ag8oszXfhoAZ", + "outputId": "b789de88-d812-44ca-d0d9-4f031127b68d" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 188 - }, - "id": "ag8oszXfhoAZ", - "outputId": "b789de88-d812-44ca-d0d9-4f031127b68d" - }, - "source": [ - "svgling.draw_tree(conlltags2tree(test(\"แมวกินปลา\")))" + "data": { + "image/svg+xml": [ + "SNPแมวNCMNVPกินVACTปลาNCMN" ], - "execution_count": 3, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "TreeLayout(Tree('S', [Tree('NP', [('แมว', 'NCMN')]), Tree('VP', [('กิน', 'VACT'), ('ปลา', 'NCMN')])]))" - ], - "image/svg+xml": "SNPแมวNCMNVPกินVACTปลาNCMN" - }, - "metadata": {}, - "execution_count": 3 - } + "text/plain": [ + "TreeLayout(Tree('S', [Tree('NP', [('แมว', 'NCMN')]), Tree('VP', [('กิน', 'VACT'), ('ปลา', 'NCMN')])]))" ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svgling.draw_tree(conlltags2tree(test(\"แมวกินปลา\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 188 }, + "id": "L3COVriThp3B", + "outputId": "27256b8d-f265-49cb-c5f1-85fee90b79e4" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 188 - }, - "id": "L3COVriThp3B", - "outputId": "27256b8d-f265-49cb-c5f1-85fee90b79e4" - }, - "source": [ - "svgling.draw_tree(conlltags2tree(test(\"คนหนองคายเป็นคนน่ารัก\")))" + "data": { + "image/svg+xml": [ + "SNPคนNCMNหนองคายNCMNVPเป็นVSTAคนNCMNน่ารักVATT" ], - "execution_count": 4, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "TreeLayout(Tree('S', [Tree('NP', [('คน', 'NCMN'), ('หนองคาย', 'NCMN')]), 
Tree('VP', [('เป็น', 'VSTA'), ('คน', 'NCMN'), ('น่ารัก', 'VATT')])]))" - ], - "image/svg+xml": "SNPคนNCMNหนองคายNCMNVPเป็นVSTAคนNCMNน่ารักVATT" - }, - "metadata": {}, - "execution_count": 4 - } + "text/plain": [ + "TreeLayout(Tree('S', [Tree('NP', [('คน', 'NCMN'), ('หนองคาย', 'NCMN')]), Tree('VP', [('เป็น', 'VSTA'), ('คน', 'NCMN'), ('น่ารัก', 'VATT')])]))" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svgling.draw_tree(conlltags2tree(test(\"คนหนองคายเป็นคนน่ารัก\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 188 }, + "id": "YwaQNhLPib6Y", + "outputId": "1ebc2402-90bf-4a37-8b3e-60b62bb52bae" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "YwaQNhLPib6Y", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 188 - }, - "outputId": "1ebc2402-90bf-4a37-8b3e-60b62bb52bae" - }, - "source": [ - "svgling.draw_tree(conlltags2tree(test(\"ปลาอะไรอยู่ในน้ำ\")))" + "data": { + "image/svg+xml": [ + "SNPปลาNCMNอะไรPNTRVPอยู่VSTAในRPREน้ำNCMN" ], - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "TreeLayout(Tree('S', [Tree('NP', [('ปลา', 'NCMN'), ('อะไร', 'PNTR')]), Tree('VP', [('อยู่', 'VSTA'), ('ใน', 'RPRE'), ('น้ำ', 'NCMN')])]))" - ], - "image/svg+xml": "SNPปลาNCMNอะไรPNTRVPอยู่VSTAในRPREน้ำNCMN" - }, - "metadata": {}, - "execution_count": 5 - } + "text/plain": [ + "TreeLayout(Tree('S', [Tree('NP', [('ปลา', 'NCMN'), ('อะไร', 'PNTR')]), Tree('VP', [('อยู่', 'VSTA'), ('ใน', 'RPRE'), ('น้ำ', 'NCMN')])]))" ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svgling.draw_tree(conlltags2tree(test(\"ปลาอะไรอยู่ในน้ำ\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 188 }, + "id": 
"PB7AU2febneD", + "outputId": "32bfea36-c0e1-484a-dbb6-b77536124507" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 188 - }, - "id": "PB7AU2febneD", - "outputId": "32bfea36-c0e1-484a-dbb6-b77536124507" - }, - "source": [ - "svgling.draw_tree(conlltags2tree(test(\"ในน้ำมีอะไรอยู่\")))" + "data": { + "image/svg+xml": [ + "SNPในRPREน้ำNCMNVPมีVSTAอะไรPNTRอยู่XVAE" ], - "execution_count": 6, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "TreeLayout(Tree('S', [Tree('NP', [('ใน', 'RPRE'), ('น้ำ', 'NCMN')]), Tree('VP', [('มี', 'VSTA'), ('อะไร', 'PNTR'), ('อยู่', 'XVAE')])]))" - ], - "image/svg+xml": "SNPในRPREน้ำNCMNVPมีVSTAอะไรPNTRอยู่XVAE" - }, - "metadata": {}, - "execution_count": 6 - } + "text/plain": [ + "TreeLayout(Tree('S', [Tree('NP', [('ใน', 'RPRE'), ('น้ำ', 'NCMN')]), Tree('VP', [('มี', 'VSTA'), ('อะไร', 'PNTR'), ('อยู่', 'XVAE')])]))" ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svgling.draw_tree(conlltags2tree(test(\"ในน้ำมีอะไรอยู่\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 188 }, + "id": "uu4KZ4OIbqy5", + "outputId": "c49b5cd2-680f-4a44-afe7-8c80368bffa8" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 188 - }, - "id": "uu4KZ4OIbqy5", - "outputId": "c49b5cd2-680f-4a44-afe7-8c80368bffa8" - }, - "source": [ - "svgling.draw_tree(conlltags2tree(test(\"ทำไมเขารักคุณ\")))" + "data": { + "image/svg+xml": [ + "SNPทำไมNCMNเขาPPRSVPรักVACTคุณPPRS" ], - "execution_count": 7, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "TreeLayout(Tree('S', [Tree('NP', [('ทำไม', 'NCMN'), ('เขา', 'PPRS')]), Tree('VP', [('รัก', 'VACT'), ('คุณ', 'PPRS')])]))" - ], - "image/svg+xml": 
"SNPทำไมNCMNเขาPPRSVPรักVACTคุณPPRS" - }, - "metadata": {}, - "execution_count": 7 - } + "text/plain": [ + "TreeLayout(Tree('S', [Tree('NP', [('ทำไม', 'NCMN'), ('เขา', 'PPRS')]), Tree('VP', [('รัก', 'VACT'), ('คุณ', 'PPRS')])]))" ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "svgling.draw_tree(conlltags2tree(test(\"ทำไมเขารักคุณ\")))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 188 }, + "id": "xAsZ9PkvbxrG", + "outputId": "1d8c7932-ecf1-4671-a9f7-b2263e3dd80a" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "id": "xAsZ9PkvbxrG", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 188 - }, - "outputId": "1d8c7932-ecf1-4671-a9f7-b2263e3dd80a" - }, - "source": [ - "svgling.draw_tree(conlltags2tree(test(\"คนอะไรอยู่หลังต้นไม้\")))" + "data": { + "image/svg+xml": [ + "SNPคนNCMNอะไรPNTRVPอยู่VSTAหลังRPREต้นไม้NCMN" ], - "execution_count": 8, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "TreeLayout(Tree('S', [Tree('NP', [('คน', 'NCMN'), ('อะไร', 'PNTR')]), Tree('VP', [('อยู่', 'VSTA'), ('หลัง', 'RPRE'), ('ต้นไม้', 'NCMN')])]))" - ], - "image/svg+xml": "SNPคนNCMNอะไรPNTRVPอยู่VSTAหลังRPREต้นไม้NCMN" - }, - "metadata": {}, - "execution_count": 8 - } + "text/plain": [ + "TreeLayout(Tree('S', [Tree('NP', [('คน', 'NCMN'), ('อะไร', 'PNTR')]), Tree('VP', [('อยู่', 'VSTA'), ('หลัง', 'RPRE'), ('ต้นไม้', 'NCMN')])]))" ] - }, - { - "cell_type": "code", - "metadata": { - "id": "SP3ZlCeQJWpq" - }, - "source": [], - "execution_count": 8, - "outputs": [] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" } - ] -} \ No newline at end of file + ], + "source": [ + "svgling.draw_tree(conlltags2tree(test(\"คนอะไรอยู่หลังต้นไม้\")))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": 
"Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}