Spaces:

GroNLP
/

agalma

Running

App Files Files Community

agalma / autocomplete.py

Mark7549

removed forms for first 2 tabs and used cache to make program faster

cdb0a70 almost 2 years ago

raw

history blame contribute delete

2.32 kB

	import pickle
	import gzip
	from word2vec import *


	def get_unique_words(corpus_filename):
	    """
	    Return the distinct whitespace-separated tokens found in a corpus file.

	    The file is read as UTF-8, one line at a time. The result is a list
	    built from a set, so the order of its elements is arbitrary.
	    """
	    with open(corpus_filename, 'r', encoding='utf-8') as corpus:
	        vocabulary = {
	            token
	            for row in corpus
	            for token in row.strip().split()
	        }
	    return list(vocabulary)


	def save_compressed_word_list(words, filename):
	    """
	    Pickle a word list and write it to a gzip-compressed file.

	    words: the object to serialise.
	    filename: destination path; it is opened in 'wb' mode, so an existing
	    file is overwritten.
	    """
	    with gzip.open(filename, 'wb') as archive:
	        pickle.Pickler(archive).dump(words)


	def load_compressed_word_list(filename):
	    """
	    Read a gzip-compressed pickle file and return the object stored in it.

	    NOTE(review): unpickling can execute arbitrary code, so this should
	    only be pointed at files the application produced itself - confirm
	    against callers.
	    """
	    with gzip.open(filename, 'rb') as archive:
	        loader = pickle.Unpickler(archive)
	        return loader.load()


	def get_autocomplete(input_word=" ", all_words=" "):
	    """
	    Return the entries of all_words that begin with input_word.

	    Matching is case-sensitive and the entries keep the order they have
	    in all_words.
	    """
	    matches = []
	    for candidate in all_words:
	        if candidate.startswith(input_word):
	            matches.append(candidate)
	    return matches


	def custom_sort(item):
	    """
	    Sort key that places all-digit strings after every other word.

	    Non-numeric words get rank 0 and are compared by their lower-cased
	    form. All-digit strings get rank 2 and are compared as plain strings,
	    so "10" sorts before "9".

	    Returns a (rank, text) tuple intended for use as a `key=` function.
	    """
	    if item.isdigit():
	        # Deliberately silent: a key function runs once per element, so a
	        # print here would flood stdout on every sort.
	        return (2, item)  # Place numbers last
	    return (0, item.lower())


	def order_compressed_list(filename):
	    """
	    Load a gzip-compressed pickled word list and return it sorted.

	    Ordering is delegated to custom_sort: ordinary words first, compared
	    case-insensitively, and all-digit strings at the end. The file on disk
	    is not modified; a new sorted list is returned.
	    """
	    with gzip.open(filename, 'rb') as file:
	        words = pickle.load(file)

	    # Sort the words
	    return sorted(words, key=custom_sort)


	def read_compressed_list(filename):
	    """
	    Print the object stored in a gzip-compressed pickle file.

	    The unpickled object is written to standard output with print();
	    nothing is returned.
	    """
	    with gzip.open(filename, 'rb') as archive:
	        contents = pickle.load(archive)
	        print(contents)


	def word_in_models_dict(words_file):
	    """
	    Create a dictionary with words as keys and models in which the word occurs as values

	    words_file: path to a gzip-compressed pickle holding an iterable of words.

	    Returns a dict mapping every word to a list of model names, as produced
	    by convert_model_to_time_name, in the order load_all_models() yields the
	    models. Words found in no model map to an empty list.
	    """
	    with gzip.open(words_file, 'rb') as file:
	        words = pickle.load(file)

	    models = load_all_models()

	    # Initialize word_models dictionary with empty lists (one entry per distinct word)
	    word_models = {word: [] for word in words}

	    for model in models:
	        # NOTE(review): model[0] is presumably the model's identifier and
	        # model[1] a trained gensim model - confirm against load_all_models.
	        model_name = convert_model_to_time_name(model[0])
	        # The vocabulary mapping does not change per word; look it up once per model.
	        vocabulary = model[1].wv.key_to_index
	        # Iterate the dict's unique keys rather than the raw list, so a word
	        # repeated in the file does not get the same model name appended
	        # once per repetition.
	        for word in word_models:
	            if word in vocabulary:
	                word_models[word].append(model_name)

	    return word_models