logo
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Readme
Files and versions

58 lines
1.8 KiB

import re
def compute_num_pads(list_bboxes):
    """Return how much padding each element needs to match the longest one.

    Args:
        list_bboxes: Iterable of sized collections (anything that supports
            ``len``). It is traversed exactly once, so a one-shot iterable
            such as a generator is handled correctly.

    Returns:
        A list with one integer per input element: the length of the longest
        element minus the length of that element. An empty input yields
        an empty list.
    """
    # Measure every element once up front so the input is never re-iterated.
    lengths = [len(bboxes) for bboxes in list_bboxes]
    # The default only matters for empty input, where the result is [] anyway.
    max_len = max(lengths, default=0)
    return [max_len - length for length in lengths]
def remove_punctuations(sentences):
    """Drop stand-alone punctuation tokens from each sentence.

    Each sentence is split on single space characters and every token that
    is exactly one of the known punctuation strings is discarded. Punctuation
    attached to a word (for example ``"end."``) is left untouched, and empty
    tokens produced by consecutive spaces are kept, so runs of spaces survive.

    Args:
        sentences: Iterable of strings.

    Returns:
        A list of strings, one per input sentence, with the remaining tokens
        re-joined by single spaces.
    """
    # A frozenset gives constant-time membership tests for every token.
    punctuations = frozenset(
        ["''", "'", "``", "`", ".", "?", "!", ",", ":", "-", "--", "...", ";"]
    )
    cleaned_sentences = []
    for sentence in sentences:
        kept_words = [word for word in sentence.split(' ') if word not in punctuations]
        cleaned_sentences.append(' '.join(kept_words))
    return cleaned_sentences
def lowercase_and_clean_trailing_spaces(sentences):
    """Lowercase each sentence and strip its trailing whitespace.

    ``str.rstrip()`` is called without arguments, so all trailing whitespace
    (spaces, tabs, newlines) is removed, not only space characters. Leading
    whitespace is preserved.

    Args:
        sentences: Iterable of strings.

    Returns:
        A list of the transformed strings, in input order.
    """
    return [sentence.lower().rstrip() for sentence in sentences]
def add_space_between_non_alphanumeric_symbols(sentences):
    r"""Surround every non-word character with a space on each side.

    A non-word character is anything the regex class ``\W`` matches. Note
    that this includes whitespace itself (a single space becomes three
    spaces) and excludes the underscore, which ``\w`` treats as a word
    character.

    Args:
        sentences: Iterable of strings.

    Returns:
        A list of strings, one per input sentence, with each non-word
        character replaced by itself padded with one space on both sides.
    """
    # ``\W`` is equivalent to the former ``[^\w0-9]``: ``\w`` already matches
    # digits, so listing ``0-9`` again was redundant. Compile once per call
    # instead of looking the pattern up for every sentence.
    non_word_char = re.compile(r'(\W)')
    return [non_word_char.sub(r" \1 ", sentence) for sentence in sentences]
def tokenize(list_sentences):
    """Split each sentence on single spaces and discard empty tokens.

    Only the space character is a separator; other whitespace such as tabs
    or newlines stays inside the tokens.

    Args:
        list_sentences: Iterable of strings.

    Returns:
        A list of token lists, one per input sentence. A sentence made only
        of spaces (or an empty string) yields an empty token list.
    """
    tokenized_sentences = []
    for sentence in list_sentences:
        # One filtering pass replaces the repeated ``remove('')`` scan, which
        # was quadratic in the number of empty tokens.
        tokens = [token for token in sentence.split(' ') if token]
        tokenized_sentences.append(tokens)
    return tokenized_sentences
def convert_vector_word2idx(sentence, word2idx_dict):
    """Translate the words of one sentence into their indices.

    Args:
        sentence: Iterable of words (presumably an already tokenized
            sentence; verify against the caller).
        word2idx_dict: Object subscripted with each word to obtain its index.

    Returns:
        A list holding ``word2idx_dict[word]`` for every word, in order.
        A failed lookup is not caught here (``KeyError`` for a plain dict).
    """
    return [
        word2idx_dict[token]
        for token in sentence
    ]
def convert_allsentences_word2idx(sentences, word2idx_dict):
    """Translate every sentence's words into indices.

    Args:
        sentences: Iterable of sentences, each itself an iterable of words.
        word2idx_dict: Lookup object passed unchanged to
            ``convert_vector_word2idx`` for every sentence.

    Returns:
        A list with one index list per input sentence, in input order.
    """
    # Iterate directly rather than by position so any iterable is accepted.
    return [convert_vector_word2idx(sentence, word2idx_dict) for sentence in sentences]
def convert_vector_idx2word(sentence, idx2word_list):
    """Translate the indices of one sentence back into words.

    Args:
        sentence: Iterable of indices.
        idx2word_list: Object subscripted with each index to obtain its word
            (presumably a list ordered by index; verify against the caller).

    Returns:
        A list holding ``idx2word_list[idx]`` for every index, in order.
        A failed lookup is not caught here.
    """
    return [
        idx2word_list[position]
        for position in sentence
    ]
def convert_allsentences_idx2word(sentences, idx2word_list):
    """Translate every sentence's indices back into words.

    Args:
        sentences: Iterable of sentences, each itself an iterable of indices.
        idx2word_list: Lookup object passed unchanged to
            ``convert_vector_idx2word`` for every sentence.

    Returns:
        A list with one word list per input sentence, in input order.
    """
    # Iterate directly rather than by position so any iterable is accepted.
    return [convert_vector_idx2word(sentence, idx2word_list) for sentence in sentences]