-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathget_embedding.py
35 lines (30 loc) · 1.13 KB
/
get_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# -*- coding: utf-8 -*-
import gzip
import os
import time
import numpy as np
import gc
def _load_fasttext_vectors(path, index):
    """Read a gzip-compressed fastText ``.vec`` text file into *index*.

    Every data line has the form ``<word> <v1> <v2> ...``. The word is
    decoded as UTF-8 and the remaining fields are stored as a ``float32``
    NumPy array.

    Args:
        path: Location of the ``.vec.gz`` file to read.
        index: Dict that is filled in place, mapping word -> vector.

    Returns:
        The same *index* object that was passed in.
    """
    # The context manager closes the handle even when parsing fails, and
    # iterating the file streams it line by line instead of materializing
    # the whole decompressed file with readlines().
    with gzip.open(path, 'rb') as vec_file:
        for line_no, raw_line in enumerate(vec_file):
            # Split the raw bytes so only ASCII whitespace separates fields.
            fields = raw_line.strip().split()
            if not fields:
                # A blank line carries no word; skip it rather than crash.
                continue
            if (
                line_no == 0
                and len(fields) == 2
                and fields[0].isdigit()
                and fields[1].isdigit()
            ):
                # fastText .vec files start with a "<count> <dim>" header,
                # which is metadata and not a word vector.
                continue
            index[fields[0].decode('utf8')] = np.asarray(
                fields[1:], dtype='float32'
            )
    return index


class GET_EMBEDDING:
    """Namespace holding the Chinese and English fastText embeddings.

    The class body runs once, when the module is imported: it reads
    ``cc.zh.300.vec.gz`` and ``cc.en.300.vec.gz`` from ``embedding_path``
    and prints the elapsed time after each file.

    Attributes:
        stime: ``time.time()`` captured when loading started.
        embedding_path: Directory containing the ``.vec.gz`` files.
        fasttext_embeddings_index: Empty dict (never populated here).
        fasttext_embeddings_index_zh / embedding_dict_zh: The same dict,
            mapping Chinese word -> ``float32`` vector.
        fasttext_embeddings_index_en / embedding_dict_en: The same dict,
            mapping English word -> ``float32`` vector.
    """

    stime = time.time()
    embedding_path = '/app/embedding'
    fasttext_embeddings_index = {}
    fasttext_embeddings_index_zh = {}
    fasttext_embeddings_index_en = {}

    embedding_dict_zh = _load_fasttext_vectors(
        os.path.join(embedding_path, 'cc.zh.300.vec.gz'),
        fasttext_embeddings_index_zh,
    )
    # Kept from the original flow: collect garbage before the second big load.
    gc.collect()
    print(f'read zh embedding time: {time.time()-stime}s.')

    embedding_dict_en = _load_fasttext_vectors(
        os.path.join(embedding_path, 'cc.en.300.vec.gz'),
        fasttext_embeddings_index_en,
    )
    # NOTE: measured from ``stime``, so this figure includes the zh load too.
    print(f'read en embedding time: {time.time()-stime}s.')