
Question for the author: how was the template.txt file in data_read('template.txt') obtained? The deepLog_hdfs_train.txt file used in the second script is also nowhere to be found in the data folder #23

Open
FLxuRu opened this issue Apr 10, 2021 · 4 comments

Comments

@FLxuRu

FLxuRu commented Apr 10, 2021

Sorry for the late reply.
These are the three code snippets I wrote before; run them in order. I hope they will be useful to you!
@huhui, @arunbaruah, @nagsubhadeep, @Magical66
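
For context on the question in the title: template.txt is presumably one parsed event template per line, produced by whatever log parser was run over the raw HDFS log (that parsing step is not shown in this thread). Below is a minimal sketch of producing such a file with the Drain3 parser; Drain3 is my assumption, not necessarily the author's tool, and 'HDFS.log' is a placeholder path:

from drain3 import TemplateMiner  # pip install drain3

# mine event templates from the raw log (assumption: Drain3 stands in for the author's parser)
template_miner = TemplateMiner()
with open('HDFS.log', 'r') as f:
    for line in f:
        template_miner.add_log_message(line.strip())

# write one mined template per line; the line index then serves as the event id
with open('template.txt', 'w') as out:
    for cluster in template_miner.drain.clusters:
        out.write(cluster.get_template() + '\n')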
1.

# -*- coding: utf-8 -*-
"""
Created on Mon Dec 23 10:54:57 2019

@author: lidongxu1
"""
import re
import spacy
import json

def data_read(filepath):
    fp = open(filepath, "r")
    datas = []  # holds the processed lines
    lines = fp.readlines()  # read the whole file at once
    for line in lines:
        row = line.strip('\n')  # strip the trailing newline
        datas.append(row)
    fp.close()
    return datas

def camel_to_snake(name):
    """
    # To handle more advanced cases specially (this is not reversible anymore):
    # Ref: https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case  
    """
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
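
# e.g. camel_to_snake('getHTTPResponseCode') -> 'get_http_response_code'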


def replace_all_blank(value):
    """
    Remove all non-letter content from value: punctuation, digits,
    whitespace runs, underscores, etc.
    :param value: the string to clean
    :return: the cleaned string
    # https://juejin.im/post/5d50c132f265da03de3af40b
    # \W matches any character that is not a letter, digit or underscore
    """
    result = re.sub(r'\W+', ' ', value).replace("_", ' ')
    result = re.sub(r'\d', ' ', result)
    return result
# https://github.com/explosion/spaCy
# https://github.com/hamelsmu/Seq2Seq_Tutorial/issues/1
nlp = spacy.load('en_core_web_sm')
def lemmatize_stop(text):
    """
    https://stackoverflow.com/questions/45605946/how-to-do-text-pre-processing-using-spacy
    """
#    nlp = spacy.load('en_core_web_sm')
    document = nlp(text)
    # lemmatization is disabled here; keep the raw token text and drop only stop words
    # lemmas = [token.lemma_ for token in document if not token.is_stop]
    lemmas = [token.text for token in document if not token.is_stop]
    return lemmas

def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: submits dict
    :param target_path: json dst save path
    :return:
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)

    file = open(target_path, 'w', encoding='utf-8')
    file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))
    file.close()

data = data_read('template.txt')
result = {}
for i in range(len(data)):
    temp = data[i]
    temp = camel_to_snake(temp)
    temp = replace_all_blank(temp)
    temp = " ".join(temp.split())
    temp = lemmatize_stop(temp)
    result[i] = temp
print(result)
dump_2_json(result, 'eventid2template.json')





# save only the fastText word vectors that are actually needed
template_set = set()
for key in result.keys():
    for word in result[key]:
        template_set.add(word)

import io
from tqdm import tqdm

# https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())  # header: vocabulary size, vector dimension
    data = {}
    for line in tqdm(fin):
        tokens = line.rstrip().split(' ')
        # store a list rather than a lazy map object, which can only be consumed once
        data[tokens[0]] = list(map(float, tokens[1:]))
    fin.close()
    return data

fasttext = load_vectors('cc.en.300.vec')

template_fasttext_map = {}

for word in template_set:
    template_fasttext_map[word] = list(fasttext[word])
    

dump_2_json(template_fasttext_map,'fasttext_map.json')
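
One caveat in the loop above: any template word missing from cc.en.300.vec (downloadable from the fastText crawl-vectors page linked earlier) raises a KeyError at fasttext[word]. A defensive variant, where the zero-vector fallback is my assumption rather than the original author's handling:

template_fasttext_map = {}
for word in template_set:
    vec = fasttext.get(word)
    if vec is None:
        vec = [0.0] * 300  # zero vector for words fastText does not cover
    template_fasttext_map[word] = list(vec)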

2.

import os
import json
import numpy as np
import pandas as pd
from collections import Counter
import math

def read_json(filename):
    with open(filename, 'r') as load_f:
        file_dict = json.load(load_f)
    return file_dict

eventid2template = read_json('eventid2template.json')
fasttext_map = read_json('fasttext_map.json')
print(eventid2template)
dataset = list()
with open('data/'+'deepLog_hdfs_train.txt', 'r') as f:
    for line in f.readlines():
        line = tuple(map(lambda n: n - 1, map(int, line.strip().split())))
        dataset.append(line)
print(len(dataset))
idf_matrix = list()
for seq in dataset:
    for event in seq:
        idf_matrix.append(eventid2template[str(event)])
print(len(idf_matrix))
idf_matrix = np.array(idf_matrix)
X_counts = []
for i in range(idf_matrix.shape[0]):
    word_counts = Counter(idf_matrix[i])
    X_counts.append(word_counts)
print(X_counts[1000])
X_df = pd.DataFrame(X_counts)
X_df = X_df.fillna(0)
print(len(X_df))
print(X_df.head())
events = X_df.columns
print(events)
X = X_df.values
num_instance, num_event = X.shape

print('tf-idf here')
df_vec = np.sum(X > 0, axis=0)
print(df_vec)
print('*'*20)
print(num_instance)
# smooth idf like sklearn
idf_vec = np.log((num_instance + 1)  / (df_vec + 1)) + 1
print(idf_vec)
idf_matrix = X * np.tile(idf_vec, (num_instance, 1))
X_new = idf_matrix
print(X_new.shape)
print(X_new[1000])

word2idf = dict()
for i, j in zip(events, idf_vec):
    word2idf[i] = j
# smooth idf for out-of-vocabulary words (the document frequency is hard-coded to 29 here)
word2idf['oov'] = math.log((num_instance + 1) / (29 + 1)) + 1

print(word2idf)
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: submits dict
    :param target_path: json dst save path
    :return:
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)

    file = open(target_path, 'w', encoding='utf-8')
    file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))
    file.close()

dump_2_json(word2idf,'word2idf.json')
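
The "smooth idf like sklearn" comment above can be sanity-checked directly: with smooth_idf=True, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, which is exactly the formula used here. A quick verification sketch, assuming scikit-learn is installed and this runs in the same session as the script above:

from sklearn.feature_extraction.text import TfidfTransformer

# fit on the same raw count matrix X built above; norm=None keeps raw tf-idf values
transformer = TfidfTransformer(smooth_idf=True, norm=None)
transformer.fit(X)
assert np.allclose(transformer.idf_, idf_vec)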
3.

import json
import numpy as np
from collections import Counter

def read_json(filename):
    with open(filename, 'r') as load_f:
        file_dict = json.load(load_f)
    return file_dict

event2template = read_json('eventid2template.json')
fasttext = read_json('fasttext_map.json')
word2idf = read_json('word2idf.json')


event2semantic_vec = dict()
# TODO:
# compute the TF of each template, then build the sentence (semantic) vector
for event in event2template.keys():
    template = event2template[event]
    tem_len = len(template)
    count = dict(Counter(template))
    for word in count.keys():
        # TF
        TF = count[word]/tem_len
        # IDF
        IDF = word2idf.get(word,word2idf['oov'])
        # print(word)
        # print(TF)
        # print(IDF)
        # print('-'*20)
        count[word] = TF*IDF
    # print(count)
    # print(sum(count.values()))
    value_sum = sum(count.values())
    for word in count.keys():
        count[word] = count[word]/value_sum
    semantic_vec = np.zeros(300)
    for word in count.keys():
        fasttext_weight = np.array(fasttext[word])
        semantic_vec += count[word]*fasttext_weight
    event2semantic_vec[event] = list(semantic_vec)
def dump_2_json(dump_dict, target_path):
    '''
    :param dump_dict: submits dict
    :param target_path: json dst save path
    :return:
    '''
    class MyEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)

    file = open(target_path, 'w', encoding='utf-8')
    file.write(json.dumps(dump_dict, cls=MyEncoder, indent=4))
    file.close()

dump_2_json(event2semantic_vec,'event2semantic_vec_sameoov.json')      
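
In formula form, the loop above builds, for each event e with template words w, a normalized tf-idf weighted average of fastText vectors:

v_e = \sum_{w \in e} \frac{\mathrm{tf}(w)\,\mathrm{idf}(w)}{\sum_{w' \in e} \mathrm{tf}(w')\,\mathrm{idf}(w')} \, \mathbf{f}(w)

where f(w) is the 300-dimensional fastText vector of w and idf(w) falls back to the 'oov' entry for words never seen in the training sequences.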

    

Originally posted by @donglee-afar in #3 (comment)

@Everyday-seu

I also have this question. Do you have any idea now?

@heyd7fc

heyd7fc commented Apr 21, 2022

> I also have this question. Do you have any idea now?

Have you solved this problem now?

@X-zhihao

Have you solved this problem now?

@Elii-hyy

what is the "deepLog_hdfs_train.txt"?
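
Judging from the parsing code in the second script (one line per session, space-separated integers, each decremented by 1 on load), deepLog_hdfs_train.txt appears to be the DeepLog-style HDFS training split: every line is one HDFS session written as a sequence of 1-indexed event template IDs, for example a line like "5 5 5 22 11 9 26" (a hypothetical illustration, not actual contents of the file).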
