| import pickle |
| import gzip |
| from word2vec import * |
|
|
|
|
def get_unique_words(corpus_filename):
    """
    Collect the distinct whitespace-separated tokens of a corpus file.

    The file is opened as UTF-8 text and read line by line. Tokens are
    gathered in a set, so each one appears exactly once in the returned
    list and the order of the list is not defined.
    """
    seen = set()
    with open(corpus_filename, 'r', encoding='utf-8') as corpus:
        for row in corpus:
            # split() without arguments already ignores leading/trailing
            # whitespace, including the newline at the end of the row.
            for token in row.split():
                seen.add(token)
    return list(seen)
|
|
|
|
def save_compressed_word_list(words, filename):
    """
    Pickle ``words`` and write the result to a gzip-compressed file.

    ``filename`` is opened in binary write mode, so an existing file at
    that path is overwritten. Nothing is returned.
    """
    with gzip.open(filename, 'wb') as archive:
        pickle.Pickler(archive).dump(words)
| |
| |
def load_compressed_word_list(filename):
    """
    Read back an object stored as a gzip-compressed pickle.

    Returns whatever object was pickled into ``filename``. Unpickling can
    execute arbitrary code, so only use this on files from a trusted
    source.
    """
    with gzip.open(filename, 'rb') as archive:
        words = pickle.Unpickler(archive).load()
    return words
| |
|
|
def get_autocomplete(input_word=" ", all_words=" "):
    """
    Return the entries of ``all_words`` that begin with ``input_word``.

    The comparison is case-sensitive and the matches keep the order in
    which they appear in ``all_words``. Both defaults are a single-space
    string, so a call without arguments iterates over that one character
    and returns ``[" "]``.
    """
    matches = []
    for candidate in all_words:
        if candidate.startswith(input_word):
            matches.append(candidate)
    return matches
|
|
|
|
def custom_sort(item):
    """
    Sort key that places all-digit strings after every other word.

    Returns a ``(group, text)`` tuple:

    * ``(2, item)`` when ``item.isdigit()`` is true; the digits are kept
      as text, so such entries compare lexicographically ("10" sorts
      before "9").
    * ``(0, item.lower())`` otherwise, which makes the ordering of the
      remaining words case-insensitive.

    The function has no side effects; it only computes the key.
    """
    if item.isdigit():
        return (2, item)
    return (0, item.lower())
|
|
|
|
def order_compressed_list(filename):
    """
    Load a gzip-compressed pickled word list and return it sorted.

    The words are ordered with ``custom_sort`` as the key, which sorts
    words alphabetically and puts numbers at the end. The file itself is
    only read; the sorted list is returned to the caller and is not
    written back.
    """
    with gzip.open(filename, 'rb') as file:
        words = pickle.load(file)

    return sorted(words, key=custom_sort)
| |
| |
def read_compressed_list(filename):
    """
    Print the object stored in a gzip-compressed pickle file.

    The unpickled object is written to standard output with ``print``;
    nothing is returned.
    """
    with gzip.open(filename, 'rb') as archive:
        contents = pickle.load(archive)
        print(contents)
|
|
|
|
def word_in_models_dict(words_file):
    """
    Map each word to the names of the models whose vocabulary contains it.

    The word list is unpickled from the gzip-compressed ``words_file``.
    Every entry returned by ``load_all_models()`` is indexed as
    ``entry[0]`` (passed to ``convert_model_to_time_name`` to obtain the
    name stored in the result) and ``entry[1]`` (whose
    ``wv.key_to_index`` is used for the membership test).
    NOTE(review): entry[1] looks like a gensim Word2Vec model -- confirm
    against the word2vec module.

    Returns a dict with one key per word; words found in no model map to
    an empty list.
    """
    with gzip.open(words_file, 'rb') as archive:
        vocabulary = pickle.load(archive)

    loaded_models = load_all_models()

    # Start every word with an empty list so absent words still get a key.
    occurrences = {}
    for token in vocabulary:
        occurrences[token] = []

    for entry in loaded_models:
        period_name = convert_model_to_time_name(entry[0])
        for token in vocabulary:
            if token not in entry[1].wv.key_to_index:
                continue
            occurrences[token].append(period_name)

    return occurrences
|
|
|
|
| |
| |