expansionnet-v2
copied
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Readme
Files and versions
58 lines
1.8 KiB
58 lines
1.8 KiB
2 years ago
|
import re
|
||
|
|
||
|
|
||
|
def compute_num_pads(list_bboxes):
    """Return how many items each entry lacks relative to the longest entry.

    Args:
        list_bboxes: Iterable of sized collections (anything supporting
            ``len``). One-shot iterators are accepted because the lengths
            are captured in a single pass.

    Returns:
        A list with one integer per entry, equal to
        ``len(longest entry) - len(entry)``. Empty input gives ``[]``.
    """
    # Capture the lengths once so the input is never iterated twice.
    lengths = [len(bboxes) for bboxes in list_bboxes]
    # default=0 keeps max() from raising on empty input; the result is [] anyway.
    max_len = max(lengths, default=0)
    return [max_len - length for length in lengths]
|
||
|
|
||
|
|
||
|
def remove_punctuations(sentences):
    """Drop stand-alone punctuation tokens from every sentence.

    Each sentence is split on single spaces; any token that is exactly one
    of the listed punctuation strings is discarded and the remaining tokens
    are re-joined with single spaces. Punctuation attached to a word
    (e.g. ``"hi!"``) is left untouched.

    Args:
        sentences: Sequence of strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    to_drop = frozenset(
        ("''", "'", "``", "`", ".", "?", "!", ",", ":", "-", "--", "...", ";")
    )
    return [
        ' '.join(token for token in text.split(' ') if token not in to_drop)
        for text in sentences
    ]
|
||
|
|
||
|
|
||
|
def lowercase_and_clean_trailing_spaces(sentences):
    """Lowercase every sentence and strip its trailing whitespace.

    Args:
        sentences: Sequence of strings.

    Returns:
        A new list with each string lowercased and right-stripped; leading
        whitespace is preserved.
    """
    cleaned = []
    for text in sentences:
        lowered = text.lower()
        cleaned.append(lowered.rstrip())
    return cleaned
|
||
|
|
||
|
|
||
|
def add_space_between_non_alphanumeric_symbols(sentences):
    """Surround every non-word character with a space on each side.

    A "word character" is whatever the regex class ``\\w`` matches, so
    underscores count as word characters and are left alone. Whitespace is
    not a word character, so an existing space also gets padded.

    Args:
        sentences: Sequence of strings.

    Returns:
        A new list of strings with the padding applied.
    """
    symbol = re.compile(r'([^\w0-9])')
    padded = []
    for text in sentences:
        padded.append(symbol.sub(r" \1 ", text))
    return padded
|
||
|
|
||
|
|
||
|
def tokenize(list_sentences):
    """Split each sentence on single spaces and discard empty tokens.

    Args:
        list_sentences: Iterable of strings.

    Returns:
        A list with one list of non-empty tokens per input sentence, in the
        same order as the input.
    """
    # split(' ') rather than split(): only the space character separates
    # tokens, so tabs and newlines remain inside tokens. Filtering in one
    # pass avoids repeated list.remove('') calls, each of which rescans
    # the list.
    return [
        [token for token in sentence.split(' ') if token]
        for sentence in list_sentences
    ]
|
||
|
|
||
|
def convert_vector_word2idx(sentence, word2idx_dict):
    """Map every word of a tokenized sentence to its index.

    Args:
        sentence: Iterable of words.
        word2idx_dict: Mapping subscripted with each word.

    Returns:
        A list of the looked-up values, in word order.

    Raises:
        KeyError: If ``word2idx_dict`` is a plain dict and a word is missing.
    """
    indices = []
    for token in sentence:
        indices.append(word2idx_dict[token])
    return indices
|
||
|
|
||
|
|
||
|
def convert_allsentences_word2idx(sentences, word2idx_dict):
    """Apply ``convert_vector_word2idx`` to every sentence.

    Args:
        sentences: Sequence of tokenized sentences.
        word2idx_dict: Mapping passed through to ``convert_vector_word2idx``.

    Returns:
        A list with one converted sentence per input sentence.
    """
    converted = []
    for tokens in sentences:
        converted.append(convert_vector_word2idx(tokens, word2idx_dict))
    return converted
|
||
|
|
||
|
|
||
|
def convert_vector_idx2word(sentence, idx2word_list):
    """Map every index of an encoded sentence back to its word.

    Args:
        sentence: Iterable of indices.
        idx2word_list: Container subscripted with each index.

    Returns:
        A list of the looked-up words, in index order.
    """
    words = []
    for position in sentence:
        words.append(idx2word_list[position])
    return words
|
||
|
|
||
|
|
||
|
def convert_allsentences_idx2word(sentences, idx2word_list):
    """Apply ``convert_vector_idx2word`` to every sentence.

    Args:
        sentences: Sequence of index-encoded sentences.
        idx2word_list: Container passed through to ``convert_vector_idx2word``.

    Returns:
        A list with one decoded sentence per input sentence.
    """
    decoded = []
    for encoded in sentences:
        decoded.append(convert_vector_idx2word(encoded, idx2word_list))
    return decoded
|