clarinsi · ppisljar · Aug 5, 2023 · Jan 22, 2024 · Jan 24, 2024 · Jan 30, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+slovene_g2p.egg-info/*
+slovene_g2p/slovene_g2p.egg-info/*
+build
+dist
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include slovene_g2p/resources/ *
diff --git a/README.md b/README.md
@@ -1,2 +1,16 @@
 # slovene_g2p
 A converter that converts Slovene words to their IPA and/or SAMPA transcriptions.
+
+
+
+## Usage
+
+```python
+from slovene_g2p.SloveneG2P import SloveneG2PBase
+g2p = SloveneG2PBase("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string")
+g2p.convert_to_phonetic_transcription(word="govoriti", msd_sl="Ggdd-em", morphological_pattern_code="G1.2.d")
+```
+
+The first constructor argument (`representation_option`) can be either "ipa_symbol" or "sampa_symbol". The second argument (`phoneme_set_option`) can be one of "cjvt_ipa_detailed_representation", "cjvt_ipa_robust_representation", "cjvt_sampa_detailed_representation", or "cjvt_sampa_robust_representation".
+
+Both `msd_sl` and `morphological_pattern_code` are available in Sloleks 3.0 and are provided by the `classla` Python package.
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+nltk>=3.6.7
+classla>=1.1.0
+reldi-tokeniser>=1.0.1
diff --git a/setup.py b/setup.py
@@ -0,0 +1,22 @@
+from setuptools import setup, find_packages
+import os
+
+cwd = os.path.dirname(os.path.abspath(__file__))
+
+requirements = open(os.path.join(cwd, "requirements.txt"), "r").readlines()
+
+with open("README.md", "r", encoding="utf-8") as readme_file:
+    README = readme_file.read()
+
+
+setup(
+    name='slovene_g2p',
+    version='0.0.9',
+    author = "Peter Pisljar",
+    description = "rule based slovenian g2p",
+    long_description=README,
+    install_requires=requirements,
+    packages=find_packages(),
+    python_requires=">=3.8.0, <3.12",
+    include_package_data=True,
+)
diff --git a/slovene_g2p.egg-info/PKG-INFO b/slovene_g2p.egg-info/PKG-INFO
@@ -0,0 +1,27 @@
+Metadata-Version: 2.1
+Name: slovene_g2p
+Version: 0.0.9
+Summary: rule based slovenian g2p
+Author: Peter Pisljar
+Requires-Python: >=3.8.0, <3.12
+License-File: LICENSE
+Requires-Dist: nltk>=3.6.7
+Requires-Dist: classla>=1.1.0
+Requires-Dist: reldi-tokeniser>=1.0.1
+
+# slovene_g2p
+A converter that converts Slovene words to their IPA and/or SAMPA transcriptions.
+
+
+
+## Usage
+
+```python
+from slovene_g2p.SloveneG2P import SloveneG2PBase
+g2p = SloveneG2PBase("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string")
+g2p.convert_to_phonetic_transcription(word="govoriti", msd_sl="Ggdd-em", morphological_pattern_code="G1.2.d")
+```
+
+The first constructor argument (`representation_option`) can be either "ipa_symbol" or "sampa_symbol". The second argument (`phoneme_set_option`) can be one of "cjvt_ipa_detailed_representation", "cjvt_ipa_robust_representation", "cjvt_sampa_detailed_representation", or "cjvt_sampa_robust_representation".
+
+Both `msd_sl` and `morphological_pattern_code` are available in Sloleks 3.0 and are provided by the `classla` Python package.
diff --git a/slovene_g2p.egg-info/SOURCES.txt b/slovene_g2p.egg-info/SOURCES.txt
@@ -0,0 +1,18 @@
+LICENSE
+MANIFEST.in
+README.md
+pyproject.toml
+setup.py
+slovene_g2p/SloveneG2P.py
+slovene_g2p/__init__.py
+slovene_g2p.egg-info/PKG-INFO
+slovene_g2p.egg-info/SOURCES.txt
+slovene_g2p.egg-info/dependency_links.txt
+slovene_g2p.egg-info/requires.txt
+slovene_g2p.egg-info/top_level.txt
+slovene_g2p/resources/SloveneG2P_phoneme_set.json
+slovene_g2p/resources/schwa_rules.tsv
+slovene_g2p/resources/table_of_consonant_phonemes.tsv
+slovene_g2p/resources/table_of_obstruent_conversions.tsv
+slovene_g2p/resources/table_of_other_symbols.tsv
+slovene_g2p/resources/table_of_vowel_phonemes.tsv
diff --git a/slovene_g2p.egg-info/dependency_links.txt b/slovene_g2p.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/slovene_g2p.egg-info/top_level.txt b/slovene_g2p.egg-info/top_level.txt
@@ -0,0 +1 @@
+slovene_g2p
diff --git a/SloveneG2P.py → slovene_g2p/SloveneG2P.py b/SloveneG2P.py → slovene_g2p/SloveneG2P.py
@@ -1,12 +1,27 @@
 import json
+import os
 from collections import defaultdict as dd
 
+current_folder = os.path.dirname(__file__)
 
 class SloveneG2P:
 
+    def __init__(self):
+        self.ipa_converter = SloveneG2PBase("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string")
+        self.sampa_converter = SloveneG2PBase("sampa_symbol", "cjvt_sampa_detailed_representation", "phoneme_string")
+
+    def ipa(self, word, msd, mpc):
+        return self.ipa_converter.convert_to_phonetic_transcription(word, msd, mpc)
+
+    def sampa(self, word, msd, mpc):
+        return self.sampa_converter.convert_to_phonetic_transcription(word, msd, mpc)
+
+
+class SloveneG2PBase:
+
     def __init__(self, representation_option, phoneme_set_option, output_option):
-        self.phoneme_set_file_path = "./resources/SloveneG2P_phoneme_set.json"
-        self.conversion_file_path = "./resources/table_of_obstruent_conversions.tsv"
+        self.phoneme_set_file_path = os.path.join(current_folder, "resources/SloveneG2P_phoneme_set.json")
+        self.conversion_file_path = os.path.join(current_folder, "resources/table_of_obstruent_conversions.tsv")
         self.representation_option = representation_option
         self.phoneme_set_option = phoneme_set_option
 
@@ -32,15 +47,14 @@ def __init__(self, representation_option, phoneme_set_option, output_option):
 
         # GET LIST OF SCHWA RULES
         self.set_schwa_combinations = set()
-        file_with_schwa_rules = open("./resources/schwa_rules.tsv", "r", encoding="UTF-8").readlines()
+        file_with_schwa_rules = open(os.path.join(current_folder, "resources/schwa_rules.tsv"), "r", encoding="UTF-8").readlines()
         for line in file_with_schwa_rules:
             all_info = line.strip("\n").split("\t")
             morph_code = all_info[0]
             morph_example = all_info[1]
             relevant_msds = all_info[2]
             for relevant_msd in relevant_msds.split(", "):
                 schwa_combination = f"{morph_code} ~ {relevant_msd}"
-                print(schwa_combination)
                 self.set_schwa_combinations.add(schwa_combination)
 
     # RESOURCE FUNCTION - LIST OF VOWEL GRAPHEMES

diff --git a/slovene_g2p/__init__.py b/slovene_g2p/__init__.py
@@ -0,0 +1 @@
+from .SloveneG2P import SloveneG2P
diff --git a/slovene_g2p/__pycache__/SloveneG2P.cpython-310.pyc b/slovene_g2p/__pycache__/SloveneG2P.cpython-310.pyc
diff --git a/slovene_g2p/__pycache__/__init__.cpython-310.pyc b/slovene_g2p/__pycache__/__init__.cpython-310.pyc
diff --git a/resources/SloveneG2P_phoneme_set.json → ...g2p/resources/SloveneG2P_phoneme_set.json b/resources/SloveneG2P_phoneme_set.json → ...g2p/resources/SloveneG2P_phoneme_set.json
diff --git a/resources/schwa_rules.tsv → slovene_g2p/resources/schwa_rules.tsv b/resources/schwa_rules.tsv → slovene_g2p/resources/schwa_rules.tsv
diff --git a/resources/table_of_consonant_phonemes.tsv → ...resources/table_of_consonant_phonemes.tsv b/resources/table_of_consonant_phonemes.tsv → ...resources/table_of_consonant_phonemes.tsv
diff --git a/resources/table_of_obstruent_conversions.tsv → ...ources/table_of_obstruent_conversions.tsv b/resources/table_of_obstruent_conversions.tsv → ...ources/table_of_obstruent_conversions.tsv
@@ -12,4 +12,8 @@ voiceless_to_voiced_obstruent	C_6.1	C_6.2	š	ž
 voiced_to_voiceless_obstruent	C_9.2	C_9.1	dž	č
 voiceless_to_voiced_obstruent	C_9.1	C_9.2	č	dž
 voiced_to_voiceless_obstruent	C_8.2	C_8.1	dz	c
-voiceless_to_voiced_obstruent	C_8.1	C_8.2	c	dz
+voiceless_to_voiced_obstruent	C_8.1	C_8.2	c	dz
+voiceless_to_voiced_obstruent	C_4	C_12.1	f	v
+voiced_to_voiceless_obstruent	C_7.2	C_7.1	ɣ	h
+voiceless_to_voiced_obstruent	C_7.1	C_7.2	h	ɣ
+voiced_to_voiceless_obstruent	C_1.2.3	C_1.1.2	b_f	p_f
diff --git a/resources/table_of_other_symbols.tsv → ..._g2p/resources/table_of_other_symbols.tsv b/resources/table_of_other_symbols.tsv → ..._g2p/resources/table_of_other_symbols.tsv
diff --git a/resources/table_of_vowel_phonemes.tsv → ...g2p/resources/table_of_vowel_phonemes.tsv b/resources/table_of_vowel_phonemes.tsv → ...g2p/resources/table_of_vowel_phonemes.tsv