From 347b1f344ca476c85a1625d36aa0462f37c4c190 Mon Sep 17 00:00:00 2001 From: romanalakomcikova Date: Fri, 10 Nov 2023 19:57:54 +0100 Subject: [PATCH] Improve Thai Dependency Parser tutorial --- source/notebooks/Thai_Dependency_Parser.ipynb | 414 +++++++++++------- 1 file changed, 257 insertions(+), 157 deletions(-) diff --git a/source/notebooks/Thai_Dependency_Parser.ipynb b/source/notebooks/Thai_Dependency_Parser.ipynb index 38810c6..a2caa3b 100644 --- a/source/notebooks/Thai_Dependency_Parser.ipynb +++ b/source/notebooks/Thai_Dependency_Parser.ipynb @@ -1,167 +1,267 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "W-sH6hmWtM9v" + }, + "source": [ + "# Thai Dependency Parser\n", + "\n", + "PyThaiNLP does not come with a dependency parser. Instead, you can use the dependency parser from [spaCy-Thai](https://github.com/KoichiYasuoka/spaCy-Thai), which was trained on Universal Dependencies. This tutorial shows you how to get started." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install spaCy-Thai." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { "colab": { - "name": "Thai_Dependency_Parser.ipynb", - "provenance": [], - "collapsed_sections": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" + "id": "fxkrKlM9s-tc", + "outputId": "6a9796bd-a818-4062-c8ae-bb3856b50515" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting spacy_thai\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ca/2d/c2e71a4143d6d9cd9db6744e328dfb9f65b98ad7607644d0ad4369bce303/spacy_thai-0.6.2-py3-none-any.whl (5.1MB)\n", + "\u001b[K |████████████████████████████████| 5.1MB 11.2MB/s \n", + "\u001b[?25hCollecting ufal.udpipe>=1.2.0\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/72/2b8b9dc7c80017c790bb3308bbad34b57accfed2ac2f1f4ab252ff4e9cb2/ufal.udpipe-1.2.0.3.tar.gz (304kB)\n", + "\u001b[K |████████████████████████████████| 307kB 45.8MB/s \n", + "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from spacy_thai) (2.2.4)\n", + "Collecting deplacy>=1.9.2\n", + " Downloading https://files.pythonhosted.org/packages/11/58/87b6286c9578fc456de1363f877228ee0d117b8de238e3e2cd49dbc06eaa/deplacy-1.9.3-py3-none-any.whl\n", + "Collecting pythainlp>=2.2.6\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/c1/09/1215cb6f6ef0cfc9dbb427a961fda8a47c111955f782f659ca2d38c79adc/pythainlp-2.2.6-py3-none-any.whl (10.6MB)\n", + "\u001b[K |████████████████████████████████| 10.6MB 28.7MB/s \n", + "\u001b[?25hRequirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.0.5)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (2.23.0)\n", + "Requirement already satisfied: 
thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (7.4.0)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (3.0.5)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (0.8.2)\n", + "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.1.3)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (2.0.5)\n", + "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (0.4.1)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (4.41.1)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.0.5)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.19.5)\n", + "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.0.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (54.1.2)\n", + "Collecting tinydb>=3.0\n", + " Downloading https://files.pythonhosted.org/packages/af/cd/1ce3d93818cdeda0446b8033d21e5f32daeb3a866bbafd878a9a62058a9c/tinydb-4.4.0-py3-none-any.whl\n", + "Collecting python-crfsuite>=0.9.6\n", + "\u001b[?25l Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)\n", + "\u001b[K |████████████████████████████████| 747kB 68.5MB/s \n", + 
"\u001b[?25hRequirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->spacy_thai) (3.0.4)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->spacy_thai) (1.24.3)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->spacy_thai) (2020.12.5)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->spacy_thai) (2.10)\n", + "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->spacy_thai) (3.7.2)\n", + "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->spacy_thai) (3.7.4.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->spacy_thai) (3.4.1)\n", + "Building wheels for collected packages: ufal.udpipe\n", + " Building wheel for ufal.udpipe (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for ufal.udpipe: filename=ufal.udpipe-1.2.0.3-cp37-cp37m-linux_x86_64.whl size=5626703 sha256=a58565fc21a1f9d3a7c51a3aea138cf612babbefb36ae05cbaccec852b55d967\n", + " Stored in directory: /root/.cache/pip/wheels/0c/9d/db/6d3404c33da5b7adb6c6972853efb6a27649d3ba15f7e9bebb\n", + "Successfully built ufal.udpipe\n", + "Installing collected packages: ufal.udpipe, deplacy, tinydb, python-crfsuite, pythainlp, spacy-thai\n", + "Successfully installed deplacy-1.9.3 pythainlp-2.2.6 python-crfsuite-0.9.7 spacy-thai-0.6.2 tinydb-4.4.0 ufal.udpipe-1.2.0.3\n" + ] } + ], + "source": [ + "!pip install spacy_thai" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "W-sH6hmWtM9v" - }, - "source": [ - "# Thai Dependency Parser\n", - "\n", - "PyThaiNLP have not ```Thai Dependency Parser```. You can use Dependency Parser from spaCy-Thai.\n", - "\n", - "\n", - "spaCy-Thai work with Universal Dependencies. [github.com/KoichiYasuoka/spaCy-Thai](https://github.com/KoichiYasuoka/spaCy-Thai)\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fxkrKlM9s-tc", - "outputId": "6a9796bd-a818-4062-c8ae-bb3856b50515" - }, - "source": [ - "!pip install spacy_thai" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collecting spacy_thai\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ca/2d/c2e71a4143d6d9cd9db6744e328dfb9f65b98ad7607644d0ad4369bce303/spacy_thai-0.6.2-py3-none-any.whl (5.1MB)\n", - "\u001b[K |████████████████████████████████| 5.1MB 11.2MB/s \n", - "\u001b[?25hCollecting ufal.udpipe>=1.2.0\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/72/2b8b9dc7c80017c790bb3308bbad34b57accfed2ac2f1f4ab252ff4e9cb2/ufal.udpipe-1.2.0.3.tar.gz (304kB)\n", - "\u001b[K |████████████████████████████████| 307kB 45.8MB/s \n", - "\u001b[?25hRequirement already satisfied: 
spacy>=2.2.2 in /usr/local/lib/python3.7/dist-packages (from spacy_thai) (2.2.4)\n", - "Collecting deplacy>=1.9.2\n", - " Downloading https://files.pythonhosted.org/packages/11/58/87b6286c9578fc456de1363f877228ee0d117b8de238e3e2cd49dbc06eaa/deplacy-1.9.3-py3-none-any.whl\n", - "Collecting pythainlp>=2.2.6\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/c1/09/1215cb6f6ef0cfc9dbb427a961fda8a47c111955f782f659ca2d38c79adc/pythainlp-2.2.6-py3-none-any.whl (10.6MB)\n", - "\u001b[K |████████████████████████████████| 10.6MB 28.7MB/s \n", - "\u001b[?25hRequirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.0.5)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (2.23.0)\n", - "Requirement already satisfied: thinc==7.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (7.4.0)\n", - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (3.0.5)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (0.8.2)\n", - "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.1.3)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (2.0.5)\n", - "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (0.4.1)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (4.41.1)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.0.5)\n", - "Requirement already 
satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.19.5)\n", - "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (1.0.0)\n", - "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy>=2.2.2->spacy_thai) (54.1.2)\n", - "Collecting tinydb>=3.0\n", - " Downloading https://files.pythonhosted.org/packages/af/cd/1ce3d93818cdeda0446b8033d21e5f32daeb3a866bbafd878a9a62058a9c/tinydb-4.4.0-py3-none-any.whl\n", - "Collecting python-crfsuite>=0.9.6\n", - "\u001b[?25l Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)\n", - "\u001b[K |████████████████████████████████| 747kB 68.5MB/s \n", - "\u001b[?25hRequirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->spacy_thai) (3.0.4)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->spacy_thai) (1.24.3)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->spacy_thai) (2020.12.5)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->spacy_thai) (2.10)\n", - "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->spacy_thai) (3.7.2)\n", - "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < 
\"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->spacy_thai) (3.7.4.3)\n", - "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->spacy_thai) (3.4.1)\n", - "Building wheels for collected packages: ufal.udpipe\n", - " Building wheel for ufal.udpipe (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for ufal.udpipe: filename=ufal.udpipe-1.2.0.3-cp37-cp37m-linux_x86_64.whl size=5626703 sha256=a58565fc21a1f9d3a7c51a3aea138cf612babbefb36ae05cbaccec852b55d967\n", - " Stored in directory: /root/.cache/pip/wheels/0c/9d/db/6d3404c33da5b7adb6c6972853efb6a27649d3ba15f7e9bebb\n", - "Successfully built ufal.udpipe\n", - "Installing collected packages: ufal.udpipe, deplacy, tinydb, python-crfsuite, pythainlp, spacy-thai\n", - "Successfully installed deplacy-1.9.3 pythainlp-2.2.6 python-crfsuite-0.9.7 spacy-thai-0.6.2 tinydb-4.4.0 ufal.udpipe-1.2.0.3\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "dqhdtyuQs_Mi" - }, - "source": [ - "import spacy_thai\n", - "nlp=spacy_thai.load()" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "37h9csjAtEZB" - }, - "source": [ - "doc=nlp(\"พวกเราใช้ภาษาไทย\")" - ], - "execution_count": 6, - "outputs": [] + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import spaCy-Thai." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "dqhdtyuQs_Mi" + }, + "outputs": [], + "source": [ + "import spacy_thai\n", + "nlp=spacy_thai.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Do the dependency parse by calling `nlp` on a sentence. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "37h9csjAtEZB" + }, + "outputs": [], + "source": [ + "doc=nlp(\"พวกเราใช้ภาษาไทย\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can visualize the dependency parse with [deplacy](https://spacy.io/universe/project/deplacy), a tree visualizer for Universal Dependencies." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 365 }, + "id": "IGyncBg6tI_D", + "outputId": "ee2f8ed3-7218-415d-d062-36b76a59fcf9" + }, + "outputs": [ { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 365 - }, - "id": "IGyncBg6tI_D", - "outputId": "ee2f8ed3-7218-415d-d062-36b76a59fcf9" - }, - "source": [ - "import graphviz\n", - "import deplacy\n", - "graphviz.Source(deplacy.dot(doc))" + "data": { + "image/svg+xml": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "deplacy\n", + "\n", + "\n", + "\n", + "r2\n", + "ROOT\n", + "\n", + "\n", + "\n", + "x2\n", + "พวกเราใช้ภาษาไทย\n", + "\n", + "\n", + "\n", + "r2->x2\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "w\n", + "\n", + "พวกเรา\n", + "\n", + "NOUN\n", + "\n", + "ใช้\n", + "\n", + "VERB\n", + "\n", + "ภาษาไทย\n", + "\n", + "PROPN\n", + "\n", + "\n", + "\n", + "x2->w:1\n", + "\n", + "\n", + "nsubj\n", + "\n", + "\n", + "\n", + "x1\n", + "ใช้ภาษาไทย\n", + "\n", + "\n", + "\n", + "x2->x1\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "x1->w:2\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "x1->w:3\n", + "\n", + "\n", + "obj\n", + "\n", + "\n", + "\n" ], - "execution_count": 7, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ], - "image/svg+xml": 
"\n\n\n\n\n\ndeplacy\n\n\n\nr2\nROOT\n\n\n\nx2\nพวกเราใช้ภาษาไทย\n\n\n\nr2->x2\n\n\n\n\n\nw\n\nพวกเรา\n\nNOUN\n\nใช้\n\nVERB\n\nภาษาไทย\n\nPROPN\n\n\n\nx2->w:1\n\n\nnsubj\n\n\n\nx1\nใช้ภาษาไทย\n\n\n\nx2->x1\n\n\n\n\n\nx1->w:2\n\n\n\n\n\nx1->w:3\n\n\nobj\n\n\n\n" - }, - "metadata": { - "tags": [] - }, - "execution_count": 7 - } + "text/plain": [ + "" ] - }, - { - "cell_type": "code", - "metadata": { - "id": "B5XXoxFttMO2" - }, - "source": [ - "" - ], - "execution_count": null, - "outputs": [] + }, + "execution_count": 4, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" } - ] -} \ No newline at end of file + ], + "source": [ + "import graphviz\n", + "import deplacy\n", + "graphviz.Source(deplacy.dot(doc))" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "Thai_Dependency_Parser.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}