Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Json数据保存到本地数据库(sqlite) #331

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
71 changes: 71 additions & 0 deletions database/culture.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os

from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session

from modules import *


def row2dict(row):
    """Convert a mapped row into a plain ``dict`` of strings.

    Each column listed in ``row.__table__.columns`` becomes a key, in the
    order the columns are listed.  The matching attribute value is passed
    through ``str``, so a ``None`` value is rendered as the text ``"None"``.
    """
    return {col.name: str(getattr(row, col.name)) for col in row.__table__.columns}


def query_example(_session: Session):
    """Run three demonstration queries against ``SongCi`` and print the results.

    1. Legacy ``Query`` API: rows by author "苏轼" whose ``paragraphs``
       contain "远山"; every row is printed, followed by the row count.
    2. ``select()`` API: rows by author "李煜"; only the first row (if any)
       is printed, followed by its author.
    3. ``select()`` API with two ``LIKE`` criteria in one ``filter`` call;
       every matching row is printed.

    Args:
        _session: An open SQLAlchemy session bound to the poetry database.
    """
    matches = (
        _session.query(SongCi)
        .filter(SongCi.paragraphs.like("%远山%"))
        .filter_by(author="苏轼")
        .all()
    )
    for match in matches:
        print(row2dict(match))
    print(len(matches))

    statement = select(SongCi).filter_by(author="李煜")
    rows = _session.scalars(statement).all()
    # Only the first hit is shown; the full result list is still fetched.
    for row in rows:
        print(str(row2dict(row)))
        print(row.author)
        break

    statement = select(SongCi).filter(SongCi.paragraphs.like("%七夕%"), SongCi.paragraphs.like("%千里%"))
    # Equivalent spelling: _session.execute(statement).scalars()
    rows = _session.scalars(statement).all()
    for row in rows:
        # Convert once per row; the original converted twice and discarded
        # the first result in an unused local.
        print(str(row2dict(row)))


def query_content(
    _session: Session,
    _table: Base,
    _column: Base,
    _column_value: str,
    _column_like: Base = None,
    _like_str: str = None,
):
    """Return the rows of ``_table`` matching an equality (and optional LIKE) filter.

    Args:
        _session: Session used to run the query.
        _table: Mapped class to select from.
        _column: Column attribute that must equal ``_column_value``.
        _column_value: Value compared against ``_column``.
        _column_like: Optional column attribute for a substring match.
        _like_str: Substring searched for in ``_column_like``; it is wrapped
            in ``%`` wildcards.  The LIKE criterion is applied only when both
            ``_column_like`` and a non-empty ``_like_str`` are supplied.

    Returns:
        A list with one ``row2dict`` dictionary per matching row.
    """
    criteria = [_column == _column_value]
    # Compare against None explicitly instead of relying on the truthiness
    # of a column object.
    if _column_like is not None and _like_str:
        criteria.append(_column_like.like(f"%{_like_str}%"))
    # Query through the session that was passed in.  The previous version
    # used the module-level ``session``, which only exists when this file is
    # executed as a script.
    rows = _session.query(_table).filter(*criteria).all()
    return [row2dict(row) for row in rows]


if __name__ == '__main__':
    # Ad-hoc demo: query the SQLite file that sits next to this script.
    url = f"""sqlite:///{os.path.join(os.path.dirname(__file__), "culture.db")}"""
    engine = create_engine(url, echo=True)
    # The context manager closes the session even if one of the queries
    # raises; the original only closed it on the success path.
    with Session(engine) as session:
        results_ci = session.query(SongCi).filter(SongCi.author == "苏轼", SongCi.rhythmic=="行香子",SongCi.paragraphs.like("%%")).all()
        results_shi = session.query(TangShi_Simple).filter(TangShi_Simple.author == "苏轼", TangShi_Simple.paragraphs.like("%情%")).all()
        results = results_ci + results_shi
        results_dict = [row2dict(i) for i in results]
        for i in results_dict:
            # SongCi rows have no "title" and TangShi_Simple rows have no
            # "rhythmic", so one of the two prints as None.
            print("词:",i.get("rhythmic",None), "诗:",i.get("title",None), "content:",i.get("paragraphs",None))

        # Keyword search across both tables.
        keyword = "情"
        results = []
        result = session.query(TangShi_Simple).filter(TangShi_Simple.paragraphs.like(f"%{keyword}%")).all()
        results.extend(row2dict(r) for r in result)
        result = session.query(SongCi).filter(SongCi.paragraphs.like(f"%{keyword}%")).all()
        results.extend(row2dict(r) for r in result)
        for r in results:
            print(r)
        print(len(results))
...
200 changes: 200 additions & 0 deletions database/migrate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
import json
import os
import time
from os.path import join

import opencc
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

from modules import *


def create_table(is_force=False):
    """Create every table registered on ``Base.metadata``.

    Uses the module-level ``engine``, which must be defined before the call.

    Args:
        is_force: When true, all tables are dropped first and then recreated.
            Otherwise only the tables that do not exist yet are created.
    """
    metadata = Base.metadata
    if is_force:
        metadata.drop_all(engine)
    # ``checkfirst=True`` is also create_all's default, so this single call
    # covers both the forced and the non-forced path.
    metadata.create_all(engine, checkfirst=True)


class JsonData(object):
    """Collect the records stored in the ``*.json`` files of one directory.

    Only files directly inside ``path`` are considered (no recursion).  Files
    whose base name contains any of the ``besides`` substrings are skipped.

    NOTE(review): when a file's top-level JSON value is an object rather than
    an array, ``get_all_data`` collects the object's keys (strings) instead of
    the object itself — confirm whether such files should count as a single
    record.
    """

    def __init__(self, path, besides=None):
        """Remember the directory to scan and the exclusion substrings.

        Args:
            path: Directory containing the JSON files.
            besides: Substrings; a file whose base name contains one of them
                is excluded.  ``None`` (the default) excludes nothing.
        """
        self.path = path
        # A fresh list per instance: a literal ``[]`` default would be one
        # object shared by every instance created without the argument.
        self.besides = [] if besides is None else besides

    def get_files(self):
        """Return the paths of the ``.json`` files in ``self.path``, sorted by name.

        ``os.listdir`` yields entries in arbitrary order; sorting makes the
        load order (and therefore the generated row ids) reproducible.
        """
        return [
            os.path.join(self.path, name)
            for name in sorted(os.listdir(self.path))
            if name.endswith(".json")
        ]

    @staticmethod
    def filter_files(files, besides: list = None, regex: str = None):
        """Drop every path whose base name contains one of ``besides``.

        Args:
            files: Iterable of file paths.
            besides: Substrings that mark a file for exclusion; ``None`` or
                an empty list keeps every file.
            regex: Accepted for interface compatibility but currently
                ignored; no regex filtering is implemented.

        Returns:
            The kept paths, in their original order.
        """
        excluded = besides or ()
        # Match against the base name only, so a directory name in the path
        # cannot trigger an exclusion.
        return [
            json_file
            for json_file in files
            if not any(marker in os.path.basename(json_file) for marker in excluded)
        ]

    @staticmethod
    def get_data(path):
        """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
        with open(path, 'r', encoding="utf-8") as f:
            return json.load(f)

    def get_all_data(self):
        """Return the items of every non-excluded JSON file as one flat list."""
        records = []
        for file in self.filter_files(self.get_files(), besides=self.besides):
            records.extend(self.get_data(file))
        return records

    def run(self):
        """Entry point used by the loaders; same as ``get_all_data``."""
        return self.get_all_data()


def yuanqu(path, besides, _session: Session):
    """Queue one ``YuanQu`` row per record found in the JSON files under ``path``.

    Args:
        path: Directory holding the JSON files.
        besides: File-name substrings to exclude (passed to ``JsonData``).
        _session: Session the rows are added to; committing is left to the
            caller.

    A key missing from a record falls back to the literal text ``"None"``.
    The ``paragraphs`` value is concatenated into one string with no separator.
    """
    for record in JsonData(path, besides=besides).run():
        _session.add(
            YuanQu(
                dynasty=record.get("dynasty", "None"),
                author=record.get("author", "None"),
                paragraphs="".join(record.get("paragraphs", "None")),
                title=record.get("title", "None"),
            )
        )


def huajian(path, besides, _session: Session):
    """Queue one ``HuaJianJi`` row per record found in the JSON files under ``path``.

    Args:
        path: Directory holding the JSON files.
        besides: File-name substrings to exclude (passed to ``JsonData``).
        _session: Session the rows are added to; committing is left to the
            caller.

    A key missing from a record falls back to the literal text ``"None"``.
    The ``paragraphs`` and ``notes`` values are each concatenated into one
    string with no separator.
    """
    for record in JsonData(path, besides=besides).run():
        _session.add(
            HuaJianJi(
                rhythmic=record.get("rhythmic", "None"),
                author=record.get("author", "None"),
                paragraphs="".join(record.get("paragraphs", "None")),
                title=record.get("title", "None"),
                notes="".join(record.get("notes", "None")),
            )
        )


def quantangshi(path, besides, _session: Session, is_simplified=False):
    """Queue ``TangShi`` rows, optionally with converted ``TangShi_Simple`` twins.

    Args:
        path: Directory holding the JSON files.
        besides: File-name substrings to exclude (passed to ``JsonData``).
        _session: Session the rows are added to; committing is left to the
            caller.
        is_simplified: When true, every record is additionally stored in
            ``TangShi_Simple`` after each field has gone through the OpenCC
            ``t2s`` converter (presumably Traditional-to-Simplified Chinese —
            confirm against the OpenCC configuration list).

    A key missing from a record falls back to the literal text ``"None"``.
    The ``paragraphs`` and ``note`` values are each concatenated into one
    string with no separator.
    """
    records = JsonData(path, besides=besides).run()
    # Built once and reused for every record.
    to_simplified = opencc.OpenCC('t2s')
    for record in records:
        fields = {
            "author": record.get("author", "None"),
            "paragraphs": "".join(record.get("paragraphs", "None")),
            "note": "".join(record.get("note", "None")),
            "title": record.get("title", "None"),
        }
        _session.add(TangShi(**fields))
        if not is_simplified:
            continue
        converted = {key: to_simplified.convert(value) for key, value in fields.items()}
        _session.add(TangShi_Simple(**converted))


def sishuwujing(path, besides, _session: Session):
    """Queue one ``SiShuWuJing`` row per dict record found under ``path``.

    Args:
        path: Directory holding the JSON files.
        besides: File-name substrings to exclude (passed to ``JsonData``).
        _session: Session the rows are added to; committing is left to the
            caller.

    Items that are not dictionaries are skipped silently.  A missing key
    falls back to the literal text ``"None"``.  The ``paragraphs`` value is
    concatenated into one string with no separator.
    """
    for record in JsonData(path, besides=besides).run():
        if isinstance(record, dict):
            _session.add(
                SiShuWuJing(
                    chapter=record.get("chapter", "None"),
                    paragraphs="".join(record.get("paragraphs", "None")),
                )
            )


def songci(path, besides, _session: Session):
    """Queue one ``SongCi`` row per record found in the JSON files under ``path``.

    Args:
        path: Directory holding the JSON files.
        besides: File-name substrings to exclude (passed to ``JsonData``).
        _session: Session the rows are added to; committing is left to the
            caller.

    A key missing from a record falls back to the literal text ``"None"``.
    The ``paragraphs`` value is concatenated into one string with no separator.
    """
    for record in JsonData(path, besides=besides).run():
        _session.add(
            SongCi(
                author=record.get("author", "None"),
                paragraphs="".join(record.get("paragraphs", "None")),
                rhythmic=record.get("rhythmic", "None"),
            )
        )


def chuci(path, besides, _session: Session):
    """Queue one ``ChuCi`` row per record found in the JSON files under ``path``.

    Args:
        path: Directory holding the JSON files.
        besides: File-name substrings to exclude (passed to ``JsonData``).
        _session: Session the rows are added to; committing is left to the
            caller.

    A key missing from a record falls back to the literal text ``"None"``.
    The ``content`` value is concatenated into one string with no separator.
    """
    for record in JsonData(path, besides=besides).run():
        _session.add(
            ChuCi(
                section=record.get("section", "None"),
                author=record.get("author", "None"),
                content="".join(record.get("content", "None")),
                title=record.get("title", "None"),
            )
        )


def lunyu(path, besides, _session: Session):
    """Queue one ``LunYu`` row per record found in the JSON files under ``path``.

    Args:
        path: Directory holding the JSON files.
        besides: File-name substrings to exclude (passed to ``JsonData``).
        _session: Session the rows are added to; committing is left to the
            caller.

    A key missing from a record falls back to the literal text ``"None"``.
    The ``paragraphs`` value is concatenated into one string with no separator.
    """
    for record in JsonData(path, besides=besides).run():
        _session.add(
            LunYu(
                chapter=record.get("chapter", "None"),
                paragraphs="".join(record.get("paragraphs", "None")),
            )
        )


def shijing(path, besides, _session: Session):
    """Queue one ``ShiJing`` row per record found in the JSON files under ``path``.

    Args:
        path: Directory holding the JSON files.
        besides: File-name substrings to exclude (passed to ``JsonData``).
        _session: Session the rows are added to; committing is left to the
            caller.

    A key missing from a record falls back to the literal text ``"None"``.
    The ``content`` value is concatenated into one string with no separator.
    """
    for record in JsonData(path, besides=besides).run():
        _session.add(
            ShiJing(
                section=record.get("section", "None"),
                chapter=record.get("chapter", "None"),
                content="".join(record.get("content", "None")),
                title=record.get("title", "None"),
            )
        )


def core(_session: Session):
    """Load every corpus directory into ``_session``.

    The corpus directories are resolved relative to the parent of the
    directory containing this file.  Each loader only adds rows to the
    session; nothing is committed here.

    Args:
        _session: Session that receives all rows.
    """
    repo_root = os.path.dirname(os.path.dirname(__file__))

    def corpus(*parts):
        # Absolute path of a corpus directory below the repository root.
        return join(repo_root, *parts)

    yuanqu(path=corpus("元曲"), besides=[], _session=_session)
    huajian(path=corpus("五代诗词", "huajianji"), besides=['huajianji-0-preface.json'], _session=_session)
    sishuwujing(path=corpus("四书五经"), besides=[], _session=_session)
    chuci(path=corpus("楚辞"), besides=[], _session=_session)
    lunyu(path=corpus("论语"), besides=[], _session=_session)
    shijing(path=corpus("诗经"), besides=[], _session=_session)
    songci(path=corpus("宋词"), besides=["author.song.json"], _session=_session)
    quantangshi(
        path=corpus("全唐诗"),
        besides=["authors", "唐诗", "表面结构字"],
        _session=_session,
        is_simplified=True,
    )


if __name__ == '__main__':
    # Rebuild culture.db (next to this script) from the JSON corpus.
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    started = time.perf_counter()
    url = f"""sqlite:///{os.path.join(os.path.dirname(__file__), "culture.db")}"""
    # ``engine`` must stay at module level: create_table() reads it as a global.
    engine = create_engine(url)
    # The context manager closes the session even if the migration raises;
    # the original only closed it on the success path.
    with Session(engine) as session:
        create_table(is_force=True)
        core(_session=session)
        session.commit()
    finished = time.perf_counter()
    print(f"spend time {int(finished - started)} seconds")
75 changes: 75 additions & 0 deletions database/modules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from sqlalchemy import Column, Integer, Text
from sqlalchemy import String
from sqlalchemy.orm import declarative_base
from sqlalchemy_repr import RepresentableBase

# Declarative base class that every model in this module inherits from.
# RepresentableBase (from sqlalchemy_repr) is supplied as the base's ``cls``;
# presumably it provides a readable __repr__ — confirm against the
# sqlalchemy-repr documentation.
Base = declarative_base(cls=RepresentableBase)

class YuanQu(Base):
    """ORM model for the ``YuanQu`` table."""
    __tablename__ = "YuanQu"
    id = Column(Integer, primary_key=True)  # integer primary key
    dynasty = Column(String(10))
    author = Column(String(20))
    paragraphs = Column(Text)  # unbounded text column
    title = Column(String(100))

class HuaJianJi(Base):
    """ORM model for the ``HuaJianJi`` table."""
    __tablename__ = "HuaJianJi"
    id = Column(Integer, primary_key=True)  # integer primary key
    title = Column(String(100))
    paragraphs = Column(Text)  # unbounded text column
    author = Column(String(20))
    rhythmic = Column(String(20))
    notes = Column(Text)  # unbounded text column

class ChuCi(Base):
    """ORM model for the ``ChuCi`` table."""
    __tablename__ = "ChuCi"
    id = Column(Integer, primary_key=True)  # integer primary key
    title = Column(String(100))
    section = Column(String(20))
    author = Column(String(20))
    content = Column(Text)  # unbounded text column

class ShiJing(Base):
    """ORM model for the ``Shijing`` table."""
    # NOTE(review): the table name is spelled "Shijing" (lower-case "j"),
    # unlike the class name and the other tables' naming — confirm whether
    # this is intentional before renaming, since it changes the schema.
    __tablename__ = "Shijing"
    id = Column(Integer, primary_key=True)  # integer primary key
    title = Column(String(100))
    chapter = Column(String(20))
    section = Column(String(20))
    content = Column(Text)  # unbounded text column

class LunYu(Base):
    """ORM model for the ``LunYu`` table."""
    __tablename__ = "LunYu"
    id = Column(Integer, primary_key=True)  # integer primary key
    chapter = Column(String(20))
    paragraphs = Column(Text)  # unbounded text column
class SiShuWuJing(Base):
    """ORM model for the ``SiShuWuJing`` table (same columns as ``LunYu``)."""
    __tablename__ = "SiShuWuJing"
    id = Column(Integer, primary_key=True)  # integer primary key
    chapter = Column(String(20))
    paragraphs = Column(Text)  # unbounded text column

class SongCi(Base):
    """ORM model for the ``SongCi`` table."""
    __tablename__ = "SongCi"
    id = Column(Integer, primary_key=True)  # integer primary key
    author = Column(String(20))
    paragraphs = Column(Text)  # unbounded text column
    rhythmic = Column(String(100))

class TangShi(Base):
    """ORM model for the ``TangShi`` table."""
    __tablename__ = "TangShi"
    id = Column(Integer, primary_key=True)  # integer primary key
    author = Column(String(20))
    paragraphs = Column(Text)  # unbounded text column
    note = Column(Text)  # unbounded text column
    title = Column(String(100))

class TangShi_Simple(Base):
    """ORM model for the ``TangShi_Simple`` table (same columns as ``TangShi``).

    NOTE(review): presumably holds the Simplified Chinese copies of the
    ``TangShi`` rows — confirm against the migration script.
    """
    __tablename__ = "TangShi_Simple"
    id = Column(Integer, primary_key=True)  # integer primary key
    author = Column(String(20))
    paragraphs = Column(Text)  # unbounded text column
    note = Column(Text)  # unbounded text column
    title = Column(String(100))


3 changes: 3 additions & 0 deletions database/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sqlalchemy == 2.0.12
opencc == 1.1.1
sqlalchemy-repr == 0.1.0