Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allows installing as python library with pip #1

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
slovene_g2p.egg-info/*
slovene_g2p/slovene_g2p.egg-info/*
build
dist
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# setup.py reads requirements.txt at build time, so it must ship in the sdist.
include requirements.txt
# Bundle every data file the package loads at runtime.
recursive-include slovene_g2p/resources/ *
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,16 @@
# slovene_g2p
A converter that converts Slovene words to their IPA and/or SAMPA transcriptions.



## usage

```
from slovene_g2p.SloveneG2P import SloveneG2PBase
g2p = SloveneG2PBase("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string")
g2p.convert_to_phonetic_transcription(word="govoriti", msd_sl="Ggdd-em", morphological_pattern_code="G1.2.d")
```

The first constructor argument selects the symbol set and can be either "ipa_symbol" or "sampa_symbol". The second argument selects the representation and can be one of "cjvt_ipa_detailed_representation", "cjvt_ipa_robust_representation", "cjvt_sampa_detailed_representation", or "cjvt_sampa_robust_representation".

Both `msd_sl` and `morphological_pattern_code` are available in Sloleks 3.0 and are provided by the `classla` Python package.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
nltk>=3.6.7
classla>=1.1.0
reldi-tokeniser>=1.0.1
22 changes: 22 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""Packaging script for the ``slovene_g2p`` distribution."""
import os

from setuptools import find_packages, setup

# Resolve input files relative to this script so the build does not depend on
# the caller's working directory.
cwd = os.path.dirname(os.path.abspath(__file__))

# NOTE(review): requirements.txt must sit next to this script at build time,
# including inside a source distribution.
with open(os.path.join(cwd, "requirements.txt"), "r", encoding="utf-8") as requirements_file:
    # One requirement specifier per line; drop newlines, blank lines and comments
    # so install_requires only contains clean specifiers.
    requirements = [
        line.strip()
        for line in requirements_file
        if line.strip() and not line.lstrip().startswith("#")
    ]

with open(os.path.join(cwd, "README.md"), "r", encoding="utf-8") as readme_file:
    README = readme_file.read()


setup(
    name='slovene_g2p',
    version='0.0.9',
    author="Peter Pisljar",
    description="rule based slovenian g2p",
    long_description=README,
    # README.md is Markdown; without this the index renders it as plain text/reST.
    long_description_content_type="text/markdown",
    install_requires=requirements,
    packages=find_packages(),
    python_requires=">=3.8.0, <3.12",
    include_package_data=True,
)
27 changes: 27 additions & 0 deletions slovene_g2p.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
Metadata-Version: 2.1
Name: slovene_g2p
Version: 0.0.9
Summary: rule based slovenian g2p
Author: Peter Pisljar
Requires-Python: >=3.8.0, <3.12
License-File: LICENSE
Requires-Dist: nltk>=3.6.7
Requires-Dist: classla>=1.1.0
Requires-Dist: reldi-tokeniser>=1.0.1

# slovene_g2p
A converter that converts Slovene words to their IPA and/or SAMPA transcriptions.



## usage

```
from slovene_g2p.SloveneG2P import SloveneG2P
g2p = SloveneG2P("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string")
g2p.convert_to_phonetic_transcription(word="govoriti", msd_sl="Ggdd-em", morphological_pattern_code="G1.2.d")
```

phoneme_option can be either "ipa_symbol" or "sampa_symbol" and representation option can be either "cjvt_ipa_detailed_representation", "cjvt_ipa_robust_representation", "cjvt_sampa_detailed_representation", "cjvt_sampa_robust_representation"

both msd_sl and morphological_pattern_code are available in sloleks 3.0 and provided by classla python package
18 changes: 18 additions & 0 deletions slovene_g2p.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
LICENSE
MANIFEST.in
README.md
pyproject.toml
setup.py
slovene_g2p/SloveneG2P.py
slovene_g2p/__init__.py
slovene_g2p.egg-info/PKG-INFO
slovene_g2p.egg-info/SOURCES.txt
slovene_g2p.egg-info/dependency_links.txt
slovene_g2p.egg-info/requires.txt
slovene_g2p.egg-info/top_level.txt
slovene_g2p/resources/SloveneG2P_phoneme_set.json
slovene_g2p/resources/schwa_rules.tsv
slovene_g2p/resources/table_of_consonant_phonemes.tsv
slovene_g2p/resources/table_of_obstruent_conversions.tsv
slovene_g2p/resources/table_of_other_symbols.tsv
slovene_g2p/resources/table_of_vowel_phonemes.tsv
1 change: 1 addition & 0 deletions slovene_g2p.egg-info/dependency_links.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

1 change: 1 addition & 0 deletions slovene_g2p.egg-info/top_level.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
slovene_g2p
22 changes: 18 additions & 4 deletions SloveneG2P.py → slovene_g2p/SloveneG2P.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,27 @@
import json
import os
from collections import defaultdict as dd

# Directory containing this module; resource file paths are joined onto it so
# they resolve regardless of the caller's working directory.
current_folder = os.path.dirname(__file__)

class SloveneG2P:
    """Convenience wrapper offering both IPA and SAMPA transcription.

    Holds two ``SloveneG2PBase`` converters, one per symbol set, each
    configured with the detailed CJVT representation and the
    ``"phoneme_string"`` output option.
    """

    def __init__(self):
        """Build the IPA converter first, then the SAMPA converter."""
        ipa_settings = ("ipa_symbol", "cjvt_ipa_detailed_representation", "phoneme_string")
        sampa_settings = ("sampa_symbol", "cjvt_sampa_detailed_representation", "phoneme_string")
        self.ipa_converter = SloveneG2PBase(*ipa_settings)
        self.sampa_converter = SloveneG2PBase(*sampa_settings)

    def ipa(self, word, msd, mpc):
        """Return what the IPA converter produces for ``word``.

        ``word``, ``msd`` and ``mpc`` are forwarded positionally and unchanged
        to ``convert_to_phonetic_transcription``.
        """
        transcribe = self.ipa_converter.convert_to_phonetic_transcription
        return transcribe(word, msd, mpc)

    def sampa(self, word, msd, mpc):
        """Return what the SAMPA converter produces for ``word``.

        ``word``, ``msd`` and ``mpc`` are forwarded positionally and unchanged
        to ``convert_to_phonetic_transcription``.
        """
        transcribe = self.sampa_converter.convert_to_phonetic_transcription
        return transcribe(word, msd, mpc)


class SloveneG2PBase:

def __init__(self, representation_option, phoneme_set_option, output_option):
self.phoneme_set_file_path = "./resources/SloveneG2P_phoneme_set.json"
self.conversion_file_path = "./resources/table_of_obstruent_conversions.tsv"
self.phoneme_set_file_path = os.path.join(current_folder, "resources/SloveneG2P_phoneme_set.json")
self.conversion_file_path = os.path.join(current_folder, "resources/table_of_obstruent_conversions.tsv")
self.representation_option = representation_option
self.phoneme_set_option = phoneme_set_option

Expand All @@ -32,15 +47,14 @@ def __init__(self, representation_option, phoneme_set_option, output_option):

# GET LIST OF SCHWA RULES
self.set_schwa_combinations = set()
file_with_schwa_rules = open("./resources/schwa_rules.tsv", "r", encoding="UTF-8").readlines()
file_with_schwa_rules = open(os.path.join(current_folder, "resources/schwa_rules.tsv"), "r", encoding="UTF-8").readlines()
for line in file_with_schwa_rules:
all_info = line.strip("\n").split("\t")
morph_code = all_info[0]
morph_example = all_info[1]
relevant_msds = all_info[2]
for relevant_msd in relevant_msds.split(", "):
schwa_combination = f"{morph_code} ~ {relevant_msd}"
print(schwa_combination)
self.set_schwa_combinations.add(schwa_combination)

# RESOURCE FUNCTION - LIST OF VOWEL GRAPHEMES
Expand Down
1 change: 1 addition & 0 deletions slovene_g2p/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .SloveneG2P import SloveneG2P
Binary file not shown.
Binary file added slovene_g2p/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@ voiceless_to_voiced_obstruent C_6.1 C_6.2 š ž
voiced_to_voiceless_obstruent C_9.2 C_9.1 dž č
voiceless_to_voiced_obstruent C_9.1 C_9.2 č dž
voiced_to_voiceless_obstruent C_8.2 C_8.1 dz c
voiceless_to_voiced_obstruent C_8.1 C_8.2 c dz
voiceless_to_voiced_obstruent C_8.1 C_8.2 c dz
voiceless_to_voiced_obstruent C_4 C_12.1 f v
voiced_to_voiceless_obstruent C_7.2 C_7.1 ɣ h
voiceless_to_voiced_obstruent C_7.1 C_7.2 h ɣ
voiced_to_voiceless_obstruent C_1.2.3 C_1.1.2 b_f p_f