Development tools:
File size: 117 KB
Downloads: 0
Uploaded: 2019-07-04
Description: word2vec is a method for learning dense vector representations of words with a shallow neural network, used in natural language processing. Below is an annotated excerpt of word2vec.c.
    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
  }
  word[a] = 0;  // terminate the string (end of ReadWord; the excerpt enters it mid-function)
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}
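The lookup above is open addressing with linear probing: on a collision the probe walks forward one slot at a time until it hits either the word or an empty slot. A minimal self-contained sketch of the same scheme (the table size and words are illustrative, not from the source):

#include <stdio.h>
#include <string.h>

#define TABLE_SIZE 7                    /* tiny table so collisions are easy to see */
static const char *slots[TABLE_SIZE];   /* NULL = empty slot */

unsigned int Hash(const char *w) {
  unsigned long long a, h = 0;
  for (a = 0; a < strlen(w); a++) h = h * 257 + w[a];
  return h % TABLE_SIZE;
}

void Insert(const char *w) {
  unsigned int h = Hash(w);
  while (slots[h] != NULL) h = (h + 1) % TABLE_SIZE;  /* linear probe */
  slots[h] = w;
}

int Find(const char *w) {
  unsigned int h = Hash(w);
  while (slots[h] != NULL) {
    if (!strcmp(slots[h], w)) return h;
    h = (h + 1) % TABLE_SIZE;
  }
  return -1;  /* reached an empty slot: the word is absent */
}

int main() {
  Insert("the"); Insert("cat"); Insert("sat");
  printf("cat -> slot %d, dog -> %d\n", Find("cat"), Find("dog"));
  return 0;
}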
// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}
// Adds a word to the vocabulary and returns its index in vocab
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed: the vocabulary reached its current limit,
  // so raise the limit and reallocate the array
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}
// Used later for sorting by word counts (descending order)
int VocabCompare(const void *a, const void *b) {
    return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}
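Note that the subtraction above truncates a long long difference to int, which can misorder words whose counts differ by more than INT_MAX; harmless for typical corpora, but worth knowing. A defensive variant (my own sketch, assuming the struct vocab_word layout used in this file):

int VocabCompareSafe(const void *a, const void *b) {
  long long ca = ((struct vocab_word *)a)->cn;
  long long cb = ((struct vocab_word *)b)->cn;
  return (cb > ca) - (cb < ca);  /* 1, 0, or -1: sorts by descending count */
}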
// Sorts the vocabulary by frequency using word counts
// (descending order, so infrequent words end up at the back of the vocab array)
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary by descending count and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;  // reset the hash table
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if (vocab[a].cn < min_count) {
      vocab_size--;
      free(vocab[vocab_size].word);
    } else {
      // Hash will be re-computed, as after the sorting it is not actual
      hash = GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}
// Reduces the vocabulary by removing infrequent tokens:
// words with count <= min_reduce are deleted; each time the vocabulary
// is reduced, min_reduce is incremented by one
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
    vocab[b].cn = vocab[a].cn;
    vocab[b].word = vocab[a].word;
    b++;
  } else free(vocab[a].word);
  vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash will be re-computed, as it is not actual
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}
// Create binary Huffman tree using the word counts.
// Frequent words will have short unique binary codes.
// The procedure repeatedly takes the two lowest-weight nodes as children,
// merges them into a parent node, removes the pair from the candidate set,
// and adds the new parent in their place.
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];
  // count: word-frequency array (calloc zero-initializes it)
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  // binary: each node's code bit relative to its parent (0 for the smaller count, 1 otherwise)
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  // parent_node: each node's parent
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    count[vocab_size + a] = count[min1i] + count[min2i];
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1;
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b];   // code: the bits from the leaf up to the root
      point[i] = b;          // point: all nodes on the path from the leaf to the root
      i++;
      b = parent_node[b];
      if (b == vocab_size * 2 - 2) break;
    }
    vocab[a].codelen = i;
    vocab[a].point[0] = vocab_size - 2;
    // Store each word's path in root-to-leaf order; note that index i - b - 1
    // is the parent closest to the leaf
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}
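Because vocab is already sorted by descending count, the loop above never needs a priority queue: pos1 walks the sorted leaves from smallest to largest while pos2 walks the internal nodes, which are created in non-decreasing weight order. A standalone sketch of the same two-pointer merge (the four word counts are made up for illustration):

#include <stdio.h>

int main() {
  long long count[2 * 4 + 1] = {45, 13, 12, 5};  // 4 "words", sorted descending
  long long parent[2 * 4 + 1], binary[2 * 4 + 1] = {0};
  int n = 4, a, min1i, min2i;
  long long pos1 = n - 1, pos2 = n, b;
  for (a = n; a < n * 2; a++) count[a] = 1000000000000000LL;  // "infinity"
  for (a = 0; a < n - 1; a++) {
    // pick the two smallest among remaining leaves (pos1) and internal nodes (pos2)
    if (pos1 >= 0 && count[pos1] < count[pos2]) { min1i = pos1--; } else { min1i = pos2++; }
    if (pos1 >= 0 && count[pos1] < count[pos2]) { min2i = pos1--; } else { min2i = pos2++; }
    count[n + a] = count[min1i] + count[min2i];
    parent[min1i] = n + a;
    parent[min2i] = n + a;
    binary[min2i] = 1;  // the larger of the pair gets bit 1
  }
  // Print each leaf's code bits leaf-upward (CreateBinaryTree reverses them)
  for (a = 0; a < n; a++) {
    printf("word %d (count %lld): ", a, count[a]);
    for (b = a; b != n * 2 - 2; b = parent[b]) printf("%lld", binary[b]);
    printf("\n");
  }
  return 0;
}

Running it shows the most frequent "word" (count 45) getting a 1-bit code and the rarest (count 5) a 3-bit code, exactly the property the comment above promises.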
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}
// Writes the vocabulary to the file save_vocab_file
// Format: one "word count" pair per line
void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}
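With that fprintf format the vocabulary file is plain text, sorted by descending count with </s> first; for example (the counts here are invented for illustration):

</s> 50000
the 123456
of 67890
cat 42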
// Reads the vocabulary back from read_vocab_file (same "word count" format)
void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}
void InitNet() {
  long long a, b;
  // syn0: the word embedding matrix
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
  if (hs) {
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
      syn1[a * layer1_size + b] = 0;
  }
  if (negative > 0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
      syn1neg[a * layer1_size + b] = 0;
  }
  for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
    syn0[a * layer1_size + b] = (rand() / (real)RAND_MAX - 0.5) / layer1_size;
  CreateBinaryTree();
}
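The training loop below looks sigmoid values up in expTable rather than calling exp() for every dot product. That table is initialized outside this excerpt; in the reference implementation it is precomputed along these lines (a sketch using this file's globals, reconstructed from how the lookup below indexes it, so treat the details as approximate):

  // expTable[i] holds sigmoid(x) for x sampled uniformly over [-MAX_EXP, MAX_EXP)
  int i;
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);  // e^x
    expTable[i] = expTable[i] / (expTable[i] + 1);                    // e^x / (1 + e^x)
  }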
void *TrainModelThread(void *id) {
  // sentence_position: index of the word currently being processed within the sentence
  // sentence_length: number of words in the sen array
  long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
  // sen: holds one sentence; each entry is a word's index in the vocabulary
  // word_count: number of words this thread has already processed
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  // neu1: in the CBOW model, the sum of the vectors of all other words in the sliding window
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb");
  // Partition the training data across threads: seek to the start of this thread's share
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
         word_count_actual / (real)(train_words + 1) * 100,
         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    // When sentence_length is 0, read one sentence and randomly discard frequent words
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi);  // vocabulary index of the next word in the training file
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
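    // Worked example of the subsampling rule above (the numbers are illustrative,
    // not from the source): with sample = 1e-3 and train_words = 1e6 we get
    // sample * train_words = 1000, so a word seen cn = 100000 times yields
    //   ran = (sqrt(100000/1000) + 1) * 1000/100000 = (10 + 1) * 0.01 = 0.11,
    // i.e. that very frequent word survives only ~11% of its occurrences, while
    // any word with cn below roughly 2600 yields ran >= 1 and is always kept.
    // next_random is a linear congruential generator (25214903917 is the
    // multiplier java.util.Random uses), and (next_random & 0xFFFF) / 65536.0
    // is a uniform draw from [0, 1).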
    if (feof(fi)) break;
    // This thread's share of the data is done
    if (word_count > train_words / num_threads) break;
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (cbow) {  // train the cbow architecture
      // in -> hidden
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
      }
      if (hs) for (d = 0; d < vocab[word].codelen; d++) {
        f = 0;
        // l2 is the offset of the d-th inner node on the word's path,
        // i.e. the row of syn1 holding W[point[d]]
        l2 = vocab[word].point[d] * layer1_size;
        // Propagate hidden -> output: compute W.x
        for (c = 0; c < layer1_size; c++)
          f += neu1[c] * syn1[c + l2];
        if (f <= -MAX_EXP) continue;
        else if (f >= MAX_EXP) continue;
        // Look the sigmoid up in the table, i.e. p(x) = f = exp(x) / (1 + exp(x))
        else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
        // Loss (log-likelihood for this node): log L = x*log(p) + (1-x)*log(1-p),
        // where p = exp(neu1.syn1[l2]) / (1 + exp(neu1.syn1[l2])) and the label
        // x = 1 - code (the author defines the label as 1-code here; the opposite
        // convention would work just as well). Substituting p gives
        //   log L = x*(neu1.syn1[l2]) - log(1 + exp(neu1.syn1[l2])),
        // whose partial derivative w.r.t. syn1 is (x - p)*neu1 = (1 - code - f)*neu1.
        // Hence the update below:
        //   g = (1 - vocab[word].code[d] - f) * alpha;   // alpha is the learning rate
        //   syn1[c + l2] += g * neu1[c];
        // 'g' is the gradient multiplied by the learning rate
        g = (1 - vocab[word].code[d] - f) * alpha;
        // Propagate errors output -> hidden
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
        // Learn weights hidden -> output
        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
      }
      // NEGATIVE SAMPLING
      if (negative > 0) for (d = 0; d < negative + 1; d++) {
        if (d == 0) {
          target = word;   // the positive example
          label = 1;
        } else {
          next_random = next_random * (unsigned long long)25214903917 + 11;
          target = table[(next_random >> 16) % table_size];
          if (target == 0) target = next_random % (vocab_size - 1) + 1;
          if (target == word) continue;
          label = 0;       // a negative example
        }
        l2 = target * layer1_size;
        f = 0;
        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
        if (f > MAX_EXP) g = (label - 1) * alpha;
        else if (f < -MAX_EXP) g = (label - 0) * alpha;
        else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
      }
      // hidden -> in
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
      }
    } else {  // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
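The negative targets above are drawn from table[], a unigram table built before training by a function that lies outside this excerpt. In the reference implementation it is filled so that each word owns a share of slots proportional to cn^0.75, roughly as follows (a sketch using this file's globals, reconstructed from memory, so treat the details as approximate):

void InitUnigramTable() {
  int a, i;
  double train_words_pow = 0, d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  // Total "smoothed" mass: sum of cn^0.75 over the vocabulary
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;  // word i owns this slot
    // Move to the next word once its cumulative share has been covered
    if (a / (double)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }
}

The 0.75 exponent flattens the unigram distribution, so rare words are sampled as negatives somewhat more often than their raw frequency would suggest.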