文件名称: word2vec源码解析.pdf
  所属分类: 深度学习
  文件大小: 117kb
  下载次数: 0
  上传时间: 2019-07-04
  提 供 者: scn****
 详细说明:word2vec 是浅层神经网络训练出来的稠密向量表示的一种方法。应用于自然语言处理过程中。word2vec. c 2014/6/99:31 f(a > MAX STRING -1)a / Truncate too long words 114 115word[a]=0;//字符串结束符 116 117 118// Returns hash value of a word 119//计算单词的hash码 120 int GetWordHash(char *word) unsigned longlong a, hash =0; 122 for (a=o; a< strlen (word)i a++) hash hash k 257 wordla] 123 hash hash vocab hash size 124 return hash 125 126 127//Returns position of a word in the vocabulary if the word is not 128 int Searchvocab(char word) i 129 unsigned int hash GetWordHash(word)i 130whi1e(1 if (vocab hash [hash] 1) return -1 132 if (!strcmp(word, vocab [vocab hash [hash]]. word)) return vocab ha 133 hash =(hash +1)o vocab hash size 135 return -1 136 138 // Reads a word and returns its index in the vocabulary 139//读取一个单词,返回在词表中的索引 140 int ReadWordIndex(FIle *fin) 141 char wordimAX stringii 142 Readword(word fin) 143 if(feof(fin)) return -1 144 return Searchvocab(word)i 145 146 147// Adds a word to the vocabulary 148//把一个单词加入到词表,并返回其在 vocab中的索引 149 int AddwordToVocab(char *word)i 50 unsigned int hash, length strlen(word)+ li 151 if (length MAX STRING) length= MAX STRING; vocab[vacab size] word =(char *)calloc (length, sizeof(char) 153 strcpy(vocab [vocab size] word, word) 54 vocab vocab size.c 155 vocab size++ 156 Reallocate memory if needed 158 司表到达当前上限,增加上限,重新分配词表空间 159 if (vocab size +2 > vocab max size)[ 160 vocab max size += 1000: 161 vocab =(struct vocab word x)realloc(vocab, vocab max size x 162 163 164 hash GetWordHash(word)i 165 while (vocab hashhash]!=-1) hash =(hash 1) vocab hash size 166 vocab hash [hash= vocab size -l 167 return vocab size -1 168 Page 3 of 15 word2vec. 
c 2014/6/99:31 169 170// Used later for sorting by word counts 171//降序排序 172 int VocabCompare(const void *a, const void *b)t 173 return ((struct vocab word *)b)->cn -((struct vocab word *)a)-> 174 175 176// Sorts the vocabulary by frequency using word counts 177//通过排序把出现数量少的word排在 vocab word数组的后面(降序排序),记 178 void Sortvocab( 179 int a size 180 unsigned int hash 181 // Sort the vocabulary and keep at the first position 182//按照词频把词表中的单词降序排序,并且使在词表的第一个位置 183 gsort(&vocab[l], vocab size -1, sizeof (struct vocab word), Vocabco 184 185 for (a=o; a< vocab hash size; a++) vocab hash[a 1;//hash码表 186 SIze cab size t≌ ain words=0; 188 for (a =o; a< size; a++)i 189 // Words occuring less than min count times will be discarded fro 190 if (vocab[al. cn min count) i 191 vocab size-- 192 free(vocab [vocab size. word); y else i 194 // Hash will be re-computed, as after the sorting it is not ac 195 hash=GetWordHash(vocab[a.word)i 196 hile (vocab hash hash!=-l)hash =(hash 1) vocab hash vocab hash [hash ai 198 train words + vocabla. cn 199 200 201 vocah (struct vocab word x)realloc(vocab, (vocab size l)* size 202 Allocate memory for the binary tree construction 203 for(a=0; a< vocab size; a++)t 204 vocab la] code =(char x calloc(MAX CODE LeNGth sizeof(char))i 205 vocab[al point =(int *)calloc(MAX CODE LENGth, sizeof(int))i 206 207 208 209// Reduces the vocabulary by removing infrequent tokens 210/′删除词频小于 min reduce的非频繁词,精简词表 211//没精简一次词表, min reduce加 212 void ReduceVocab 213 int a, b=0: 214 unsigned int hash 215 for (a=0; a< vocab size, a++)( 216 if (vcab lal.cn> min reduce) 217 vocab [].cn vocab la].cni 218 vocab[b]. word vocab[a]. word; 219 b++ 220 221 else free(vocab].word)i 222} 223 V○cas1ze 224 for(a=0ia< vocab hash size a++)vocab hasha Page 4 of 15 word2vec. C 2014/6/99:31 225 for(a=0r a< vocab size a++)i 226 / Hash will be re-computed, as it is not actual 227 hash GetWordHash(vocab lal. 
word)i 228 while (vocab hashhash !=-l)hash =(hash 1) vocab hash si 229 vocab hash [hash] =ai 230 231 fflush(stdout)i 232 min reduce++i 233} 234 235// Create binary Huffman tree using the word counts 236 // Frequent words will have short unige binary codes 237//流程是先在所有的 vacabulary中找2个最小 weight的节点作为叶子节点,we 238//合并成一个父亲节点,从词序列中剔除这两个节点,并加入合成的父亲节点 239 void CreateBinaryTree( 240 long long a, b, i, minli, mini, posl, pos2, point [MAX CODE LENGTH 241 char code [max code length 242// count:词频数组,自动初始化为0 243 long long *count (longlong *)calloc(vocab size 2 t l, sizeof( 244//记录每个节点与其父节点的关系:词频小的为0,否则为1 245 long long *binary =(long long *)calloc(vocab size *2+ 1, sizeof 246//记录每个节点的父节点 247 long long *parent node =(long long )calloc(vocab size *2 1,s 248 249 for(a=oia< vocab size a++) 250 count la= vocab lal. cni 251 for(a= vocab size a vocab size k 2i a++) 252 countlal lel5; 253 254 posl vocab size 255 os2= vocab size 256 257 // Following algorithm constructs the Huffman tree by adding one no 258 for(a=0; a vocab size -1; a++)i 259 First, find two smallest nodes 'minl, min21 260 if (pCs1 >=0 261 if (count [pas1]< count [pos2]) 262 mLn⊥1 poS1 263 pCsl- 264 else i 265 minli= pos2 266 pS2++ 267 268 3 else i 269 minli 270 Cs2++ 271 272 i(pCs1>=0){ 273 if (count lposl]< count [pos2])i 274 min21= pcli 275 pOS1 276 else i 277 mini cs2: 278 pOS2++i 279 280 y else Page 5 of 15 word2vec. c 2014/6/99:31 281 mini= pos2 282 pCs2++i 283 284 count vocab size + a count linli] count lmin2ii 285 parent node[minli vocab size t ai 286 parent node[mini vocab size t ai 287 binary[min211=1 288} 289 290// Now assign binary code to each vocabulary word 291 for(a =0; a< vocab size; a++)i 292 b 293 0 294 whi⊥e(1){ 295 code binary[b];//code:从叶节点到根的编码 296 point[i]=b;// point:从叶节点到根的路径上的所有节点 297 1++; 298 b= parent node [b] 299 if(b== vocab size x 2-2) break; 300 301 vocable. codelen 302 vocab[a] point[o] vocab size -2i 303//下面存放每个基本词的路径,注意i-b-1是距离叶子节点最近的父青 304 fcr(b=0; b< ii b++)[ 305 vocab la. 
code li -b-11= code [b] 306 vocab[alpoint[i -b]= point[b] -vocab size 307 308 309 free(count)i 310 free(binary) 311 free(parent node) 312} 313 314 void learnVocabFromTrainfile(i 315 char word LMAX STRING] 316 FIle *fini 317 longlong a, ii 318 for a=or a< vocab hash size a++) vocab hash[a 319 fin fopen(train file, rb")i 320 if(fin = NUlL)I 321 printf( ERROR: training data file not found! \ n")i 322 会又it(1); 323 324 vocab size =o: 325 AddWordToVocab((char x)""); 326whi1e(1){ 327 Readword(word, fin)i 328 i(feof(三in)) break; 329 七工 ain words++; 330 if ((debug mode 1)&& (train words 100000==0) 331 printf("olldkoc", train words /1000, 13)i 332 fflush(stdout)i 333 334 1= SearchVocab(word) 335 if(i==-1) 336 a= AddWordToVocab(word)i Page 6 of 15 word2vec. c 2014/6/99:31 337 vocab la 338 y else vocab [i]. cn++i 339 if (vocab size vocab hash size x 0.7) Reducevocab(i 340 341 Sortvccab(i 342 if(debug mode >0)i 343 printf("Vocab size: olld\n", vocab size)i 344 printf("Words in train file: lld\n", train words)i 345 346 file si ftell(fin)i 347 fclose(fin)i 348} 350//将词表写入文件 save vocab file 351//格式: word fre(单词频率) 352 void Savevocab long long i 354 FIlE fc= fopen(save vocab file, wb")i 355 for 0 vocab size: i++)fprintf(fo, "s lld\n", vocab[i] 356 fclose(fo)i 357 358 359 void Readvocab(( 360 long long 361 char ci 362 char word[MAX STRING I 363 FILE *fin = fopen (read vocab file,rb")i 364 f (fin NULL) I 365 printf(" Vocabulary file not found\n")i 366 exit(l)i 367 368 for (a =0; a< vocab hash size; a++) vocab hash[al 369 vocab size 370 371Whi⊥e(1) 372 Readword(word, fin)i 373 if (feof(fin) breaki 374 a AddwordToVocab(word) 375 fscanf(fin, 9lldoc", &vocab [a].cn, &c)i 376 1+I 377 378 379 SortVocab( 380 if (debug mode >0)( 381 printf("Vacab size: lld\n", vocab size)i 382 printf ("Words in train file: olld\n", train words)i 383 384 fin fopen(train file," rb") 385 if (fin ==NULL) I 386 printf("ERROR: training data file not found! 
\n")i 387 exit (1) 388 389 fseek(fin 0, SEEK eND )i 390 file size ftell(fin) 391 fclose(fin) 392 Page 7 of 15 word2vec. c 2014/6/99:31 393 394 void InitNet( 395 long long a, bi 396 //word embedding matrix 397 a=posix memalign((void **)&syno, 128,(long long)vocab size la 398 399 if (syno = NULL) printf("Memory allocation failed\n"); exit(1)i) 400 if(hs)( 401 a= posix memalign((void x*)&synl, 128, (Long long)vocab size x 02 if (syn1== NULL) printf("Memory allocation failed\n");exit(1) 403 for (b=0i b layerl size b++) 404 for(a =0; a vocab size a++ 405 synl[a layerl size b] =0; 406 407 if (negative>0)t 408 posix memalign((void x*)ssynlneg, 128, (long long)vocab size 409 if (synlneg = NUlL)Printf("Memory allocation failed\n")exit 410 for (b=0i b layerl size b++) for (a=0, a< vocab size; a++) 412 synlneg [a x layerl size b]=0i 413 414 415 for (b=0, b< layerl size b++) 416 for(a =0; a< vocab size, a++) 417 syno[a layerl size b] =(rand()/(real)RAND MAX -0.5)/ 418 CreateBinaryfree(i 419} 420 421 void xTrainModel Thread(void *id) 422// sentence position:当前句子中的单词索引,即:当前正在处理句子中的 423// sentence1 ength:sen数组中单词的个数 424 longlong a, b, d, word, last word, sentence length =0, sentence 425/读取一个句子"放在sen数组中,sen数组中存放句子中每个单词在词典中 426 / word count:该线程口经处理的单词个数 42 long long word count =0, last word count =0, sen [MAX SENTENCE lEl 428 longlong ll, 12, c, target, label 429 unsigned long long next random =(long long)id; 430 real f, gi 431 clock t now 432//CBO模型中,滑动窗口中其他所有词的词向量的和〃 433 real u1=(real x)calloc (layerl size, sizeof(real))i 434 real neule =(real *)calloc (layerl size, sizeof(real)) 435 FIlE *fi= fopen(train file,rb") 436//将训练数据评分给每个进程,将文件指针定位到其负责的数据起始处 437 fseek(fi, file size / (longlong)num threads (longlong)id, SEEK 438whi1e(1){ 439 if (word count last word count >10000)( 440 word count actual + word count -last word counti 441 last word count word count; 442 f((debug mode > 1)) 443 now=clock( 444 printf("cAlpha: of Progress: 9.2f Words/thread/sec: 0.2 445 word count actual 
/(real)(train words 1)* 100 446 word count actual /((real)(now -start +1)/(real)ClOCKs 447 fflush(stdout)i Page s of 15 word2vec. c 2014/6/99:31 449 alpha starting alp (1 -word count actual /(real)(train 450 if (alpha starting alpha * 00001)alpha starting alpha x 451 452 453 //当 sentence length为0时,读取一个句子,并随机的丢弃高频词 454 if (sentence length ==0)t 455 while (1)f 456 Wrd= ReadWordindex(fi);//返回训练文件中下一个单词在词典中的 457 if (feof(fi)) breaki 458 if (word == -l) continue 459 word count++ 460 if (word ==0 break; 461 / The subsampling randomly discards frequent words while kee 462 /二次采样,随机丢弃高频词 463 if (sample >0)i 464 real ran =(sqrt(vocab[word]. cn /(sample train words) 465 next random next random (unsigned long long)2521490391 466 if (ran (next random OxFFFF)/(real)65536) continue; 467 468 sen[sentence length] word; 469 sentence length++i 470 if (sentence length MAX SENTENCE LENGTH breaki 471 472 sentence position =0; 473 474 475 if (feof(fi)) breaki 476 /该线程的任务完成,结東 477 f (word count train words num threads) break 4/8 479 word sen sentence position]i 480 if (word ==-1) continue; 481 482 for (c=0 layer1 size; C++) neul[c]=0 483 for (c=0; c layerl size c++) neule[c 484 485 next random next random *(unsigned longlong)25214903917+ 11 486 b next random window 487 if (chow) //train the cbow architecture 488 // in -> hidden 489 for (a =bi a s window x 2 1-b; a++)if (a != window)[ 490 c= sentence position - window t ai 491 if (c<0) continue 492 if (c > sentence length) continue; 493 last word sen[c]i 494 f ( last word == -l) continue 495 for (c=0; c< layerl size; C++) neul[c] + synO[c last we 496 497 498 if (hs) for (d =0; d vocab []. 
codelen; d++)( 499 f 0 500 //此处注意12即为每个父节点的索引,对应到模型中即网x,中的W[par 501 12 vocab[word]point[d] layerl size; 502 // Propagate hidden - output 503 //进行Wx计算 504 for (c=0;c< layerl size, C++) Page of 15 wOrd2veC。C 2014/6/99:31 505 I + neul[c]* synl[c 12]; 506 507 f(f <=-MAX EXp) continue 508 else if (f>= MAX EXP) continue 509 //进行ex(啊x)查表,即p(x)=f=exp(x)/(1+ep(x) 510 else f= expTable[(int)(f+ MAX EXP)*(EXP TABLE SIZE /MA. 511 512 513 //Loss=xlogp(x)+(1-x)**log(1-p (x)) 514 //F Hp(x)=exp(neul[c] synl[c +12])/(1+exp(neul[c] synl 515 //x=1-ccde#作者才此处定义1abe1为1-coe,实际上也可以是ccde 516 lOg(L)=(1-x)* neul[c] synl[c 121 -x*log(1 exp(neu 517 //对1og()中的syn进行偏导,g=(1-coe-p(x))*syn1 518 //因此会有 519 //g=(1- vocab[word]. code [d]-f)*a1pha; alpha学习速率 520 //syn1[c+12]+=g*neu1[c]; 521 /'g is the gradient multiplied by the learning rat 522 g=(1-vocab[word] code[d] -f)* alpha 523 // Propagate errors output -> hidden 524 for (c =0; c< layerl size; C++) neule[c] +=g synl[c +1 525 // Learn weights hidden -> output 526 for (c=0; c layerl size c++) synl[c 12 nella 527 528 529 / NEGATIVE SAMPLING 530 if (negative >0)for (d=0; d< negative l d++)t 531 (d 0) 532 target wordi label 1 534 }e⊥se{ 535 next random = next random *(unsigned long long)2521490391 536 target table[(next random >> 16) table size] if(target ==0) target= next random (vocab size -1)+ 538 if (target = word) continue; 539 ⊥ae⊥=0 540 12= target layerl size 542 f=0 543 for (c =0;c< layerl size C++)f + neul[c] synlneg[c+ 544 if (f> MAX EXP)g=(label- 1)* alpha 545 else if (f < -MAX EXp)g=(label -0)* alpha 546 else g=(label-expTable[(int)((f MAX EXP)* (EXP TABLE 547 for (c=0;c< layerl size c++) neule[c] + g synlneglc 548 for (c=0; c< layerl size, C++) synlneg[c +121 k neu 549 5 50 , hidden - in 551 for (a =bi a< window x 2 +1 -b; a++)if(a ! 
window 552 c= sentence position window +a; 553 if (c<0 continue 554 (c >= sentence length)continue; 555 last word sen lc] 556 if (last word ==-1) continue; 557 for (c =0; c layerl size c++) syno[c last word layer 558 559 y else //train skip-gram 560 for(a=bi a< window x 2+ 1-b; a++) if(a ! window)[ Page 10 of 15



  • 本站资源为会员上传分享交流与学习,如有侵犯您的权益,请联系我们删除.
  • 本站是交换下载平台,提供交流渠道,下载内容来自于网络,除下载问题外,其它问题请自行百度
  • 本站已设置防盗链,请勿用迅雷、QQ旋风等多线程下载软件下载资源,下载后用WinRAR最新版进行解压.
  • 如果您发现内容无法下载,请稍后再次尝试;或者到消费记录里找到下载记录反馈给我们.
  • 下载后发现下载的内容跟说明不相符,请到消费记录里找到下载记录反馈给我们,经确认后退回积分.
  • 如下载前有疑问,可以通过点击"提供者"的名字,查看对方的联系方式,联系对方咨询.
 相关搜索: word2vec源码解析.pdf