20200203098 / project2 / Commits / 97303acd
Commit 97303acd authored Apr 11, 2020 by TeacherZhu (parent e64c39bf)
Showing 1 changed file, 852 additions and 0 deletions:
课件/0411Skip-gram源码讲解/word2vec.c (courseware / Apr 11 Skip-gram source code walkthrough; new file, mode 100644)
//  Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // clock_t, clock() and CLOCKS_PER_SEC, used for progress reporting
#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40
// Global variables
const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary

// Floating-point type used throughout
typedef float real;                    // Precision of float numbers

// Structure describing one vocabulary word
struct vocab_word {
  long long cn;                // number of occurrences of the word
  int *point;                  // path from the root to the leaf in the Huffman tree
  char *word, *code, codelen;  // the word itself, its Huffman code, and the code length
};

char train_file[MAX_STRING], output_file[MAX_STRING];           // training file, output file
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab;      // the vocabulary (words and their counts)

// Default parameter values
int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
int *vocab_hash;               // hash table mapping word hashes to vocabulary indices
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;

int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;
// Build the probability table used to draw negative samples
void InitUnigramTable() {
  int a, i;
  double train_words_pow = 0;
  double d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  // Roulette-wheel style: each word gets a slice of the table proportional to cn^0.75
  i = 0;
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (double)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }
}
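
// Illustrative sketch added for this walkthrough (not part of the original word2vec.c):
// drawing one negative sample from the table built above. A word i owns a share of the
// table proportional to cn_i^0.75 / sum_j cn_j^0.75, so counts of e.g. {4, 1} give shares
// of roughly 74% and 26%. The helper name and the way the RNG state is passed in are
// assumptions made for illustration only; it assumes InitUnigramTable() has already run.
static int SampleNegativeSketch(unsigned long long *rng_state) {
  *rng_state = *rng_state * (unsigned long long)25214903917 + 11;  // same LCG as the training loop
  return table[(*rng_state >> 16) % table_size];                   // uniform slot = unigram^0.75 draw
}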
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue;                     // carriage return '\r'
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {                              // a word is already in progress: terminate it
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>");           // a newline is represented by the token </s>
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;               // Truncate too long words
  }
  word[a] = 0;
}
// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}
// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1;      // the word is not in the vocabulary
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];  // return its index
    hash = (hash + 1) % vocab_hash_size;        // linear probing
  }
  return -1;                                    // not reached
}
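
// Walkthrough note (added, not in the original source): vocab_hash is an open-addressing
// hash table with linear probing. A hypothetical trace: if "cat" and "dog" both hash to
// slot 7, "cat" (inserted first) sits in vocab_hash[7] and "dog" is pushed to slot 8.
// Looking up "dog" then probes slot 7, sees a different word, moves on to slot 8 and
// matches; hitting an empty slot (-1) at any point means the word is absent. The table
// holds vocab_hash_size = 30M slots and the vocabulary is pruned at 70% load (see
// LearnVocabFromTrainFile), which keeps the probe chains short.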
// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}
// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;   // word length plus the terminating '\0'
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));   // store the word in the next free slot
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);                       // hash the new word
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;   // resolve collisions by linear probing
  vocab_hash[hash] = vocab_size - 1;              // map the hash slot to the word's vocabulary index
  return vocab_size - 1;
}
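
// Illustrative sketch added for this walkthrough (not part of the original word2vec.c):
// the lookup-or-insert pattern that LearnVocabFromTrainFile applies to every token, shown
// in isolation. The helper name is hypothetical; it assumes vocab and vocab_hash have
// already been allocated (as done in main).
static void CountWordSketch(char *w) {
  long long i = SearchVocab(w);          // -1 if the word has not been seen before
  if (i == -1) i = AddWordToVocab(w);    // AddWordToVocab leaves cn at 0
  vocab[i].cn++;                         // either way, bump its count
}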
// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}
// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  // The hash table has to be rebuilt after sorting
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash will be re-computed, as after the sorting it is not actual
      hash = GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary (Huffman) tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}
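
// Walkthrough note (added, not in the original source): after SortVocab returns,
// vocab[0] is always "</s>", vocab[1..vocab_size-1] is sorted by decreasing count,
// every word with cn < min_count (other than index 0) has been freed and dropped,
// vocab_hash has been rebuilt for the new indices, and train_words holds the total
// number of retained training tokens. Each surviving word also receives MAX_CODE_LENGTH
// bytes for its Huffman code and path, which CreateBinaryTree fills in later.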
// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  // Controlled by min_reduce: only words with cn > min_reduce survive
  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
    vocab[b].cn = vocab[a].cn;
    vocab[b].word = vocab[a].word;
    b++;
  } else free(vocab[a].word);
  vocab_size = b;   // number of words left after pruning
  // Rebuild the hash table
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash will be re-computed, as it is not actual
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;   // each call prunes more aggressively
}
// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];
  // Allocate space for 2*vocab_size + 1 nodes (more than is strictly necessary here)
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  // Initialize the two halves separately:
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;        // first half: the word counts (the leaves)
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;  // second half: a large constant (internal nodes, not built yet)
  // Two cursors: pos1 walks backwards over the leaves, pos2 walks forwards over the internal nodes
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    // Pick the smallest node min1
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    // Pick the second smallest node min2
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    count[vocab_size + a] = count[min1i] + count[min2i];
    // Set the parent of both children
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1;   // the second child is labelled with code bit 1
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    // For each word, walk from its leaf up to the root
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b];                  // code bit of the current node
      point[i] = b;                         // record the leaf-to-root sequence of nodes
      i++;
      b = parent_node[b];                   // move to the parent
      if (b == vocab_size * 2 - 2) break;   // reached the root, which carries no code bit
    }
    vocab[a].codelen = i;                            // code length of the word
    vocab[a].point[0] = vocab_size - 2;              // the root node
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];            // reverse the code so it reads root-to-leaf
      vocab[a].point[i - b] = point[b] - vocab_size; // root-to-leaf path of internal node indices
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}
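
// Walkthrough note (added, not in the original source): a small hypothetical example of
// the construction above. With four words whose sorted counts are {4, 2, 1, 1}, the two
// counts of 1 are merged first (internal node of weight 2), then the two weights of 2
// (node of weight 4), then the remaining 4 and 4 into the root. One possible run assigns
// codes "1", "01", "001" and "000", i.e. code lengths 1, 2, 3, 3: frequent words sit near
// the root and get short codes. vocab[a].point stores the internal nodes of the
// root-to-leaf path with vocab_size subtracted, so each entry directly indexes a row of
// syn1 during the hierarchical-softmax updates, and point[0] is always the root
// (index vocab_size - 2).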
// Read the training file and build the vocabulary from it
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];   // buffer for one word
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;   // initialize the hash table
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;                   // number of distinct words seen so far
  AddWordToVocab((char *)"</s>");   // reserve </s> as the very first vocabulary entry
  // Read the text one word at a time
  while (1) {
    ReadWord(word, fin);            // read the next word
    if (feof(fin)) break;           // stop at end of file
    train_words++;                  // count every training token
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);          // look the word up in the vocabulary
    if (i == -1) {                  // not found yet
      a = AddWordToVocab(word);     // add it
      vocab[a].cn = 1;              // its count starts at 1
    } else vocab[i].cn++;           // otherwise increment its count
    // If the vocabulary grows too large for the hash table, prune low-frequency words
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();   // sort the vocabulary by word frequency
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}
// Save the vocabulary
void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  // The saved vocabulary consists of the words and their counts
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}
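
// Walkthrough note (added, not in the original source): the vocabulary file is plain
// text, one "<word> <count>" pair per line, written by the fprintf above. A hypothetical
// excerpt:
//   </s> 1000000
//   the 123456
//   of 98765
// ReadVocab below parses exactly this layout back, using ReadWord for the word and
// fscanf("%lld%c") for the count plus the trailing newline.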
// Read the vocabulary from read_vocab_file instead of building it from the training data
void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;   // initialize vocab_hash
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  // The training file is still needed to determine its size (for splitting across threads)
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}
// Initialize the network.
// Two parts: 1) initialize the word vectors; 2) initialize the weights from the projection layer to the output layer.
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  // Allocate the word-vector matrix.
  // posix_memalign returns aligned memory and is used like malloc; the last argument is the size in bytes.
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
  // Structures for hierarchical softmax
  if (hs) {
    // Weights between the projection layer and the output layer
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
     syn1[a * layer1_size + b] = 0;   // these weights start at zero
  }
  // Structures for negative sampling
  if (negative > 0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
     syn1neg[a * layer1_size + b] = 0;
  }
  // Random initialization of the word vectors
  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    // 1) the bitwise AND keeps only the low 16 bits;
    // 2) 0xFFFF means the masked value lies in 0..65535;
    // 3) dividing by 65536 maps it into [0, 1).
    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;   // initialize the word vector
  }
  // Build the Huffman tree
  CreateBinaryTree();
}
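
// Walkthrough note (added, not in the original source): each syn0 component above is
// drawn uniformly from [-0.5/layer1_size, 0.5/layer1_size), so with the default
// layer1_size = 100 the initial values lie in roughly [-0.005, 0.005), while syn1 and
// syn1neg start at exactly zero. The update next_random = next_random * 25214903917 + 11
// is a 64-bit linear congruential generator (the same multiplier and increment as the
// classic drand48/java.util.Random LCG), reused throughout the training code so every
// thread can keep its own cheap RNG state.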
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  // layer1_size is the dimensionality of the word vectors
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));    // projection-layer activations (used by CBOW)
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));   // accumulated gradient for the input vectors (used by both models)
  FILE *fi = fopen(train_file, "rb");
  // The training file is split across threads; each thread starts at its own offset
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  // Core training loop
  while (1) {
    // Every 10000 words, update the progress statistics and the learning rate
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
         word_count_actual / (real)(iter * train_words + 1) * 100,
         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      // Linearly decay alpha with training progress
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      // Do not let the learning rate become too small
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    // sentence_length == 0 means no sentence is buffered yet;
    // read at most MAX_SENTENCE_LENGTH words into the sentence buffer
    if (sentence_length == 0) {
      // Read from this thread's current position in the file
      while (1) {
        word = ReadWordIndex(fi);   // index of the word in the vocabulary
        if (feof(fi)) break;
        if (word == -1) continue;   // word not in the vocabulary
        word_count++;
        if (word == 0) break;       // index 0 is </s>: end of sentence
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;   // store the word's vocabulary index
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;   // sentence buffer is full
      }
      sentence_position = 0;   // start processing from the first word of the buffered sentence
    }
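
    // Walkthrough note (added, not in the original source): in the subsampling test above,
    // the keep probability of word w is ran = (sqrt(cn_w / (sample * train_words)) + 1)
    // * (sample * train_words) / cn_w, compared against a uniform draw from [0, 1).
    // Words with cn_w <= sample * train_words get ran >= 2 and are always kept. As a
    // hypothetical example, with sample = 1e-3 and 100M training words, a word covering
    // 10% of all tokens (cn_w = 10M) gets ran = (sqrt(100) + 1) / 100 = 0.11, so about
    // 89% of its occurrences are skipped, which speeds up training and reduces the
    // dominance of very frequent words.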
    // This thread has finished its share of the file (or the current pass over it)
    if (feof(fi) || (word_count > train_words / num_threads)) {
      // Account for the words processed since the last progress update
      word_count_actual += word_count - last_word_count;
      // One pass of this thread over its chunk is done
      local_iter--;
      if (local_iter == 0) break;   // all iterations finished
      // Reset the counters and start the next pass
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      // Rewind to the start of this thread's chunk
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    // sen holds the vocabulary indices of the buffered sentence
    word = sen[sentence_position];   // sentence_position marks the current (center) word
    if (word == -1) continue;
    // Reset the projection layer and the gradient accumulator
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    // Draw a random number in [0, window - 1]; the effective window is shrunk by b on each side
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    // Train the model on this position
    if (cbow) {  // train the CBOW model
      // in -> hidden
      // Input layer to projection layer: average the context word vectors
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;   // position of the context word
        // Skip positions outside the sentence
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];                   // vocabulary index of the context word
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];   // accumulate
        cw++;
      }
      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;   // average over the context words
        // The center word being predicted is `word`
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          // Walk the Huffman path of the center word and compute each node's output
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;   // weights of the d-th node on the path
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];   // sigmoid via the lookup table
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          // Choose the target word and its label
          if (d == 0) {   // positive sample: the actual center word
            target = word;
            label = 1;
          } else {        // draw a negative sample
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];   // sample from the unigram table
            // Never use </s> as a negative sample; redraw uniformly instead
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];   // projection to output
          // Gradient times learning rate, with the sigmoid clamped outside [-MAX_EXP, MAX_EXP]
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
        }
        // hidden -> in
        // The output-side weights are updated; now propagate the accumulated gradient back to the context word vectors
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          // Every context word vector receives the same accumulated gradient neu1e
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else {  // train the skip-gram model
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          // For skip-gram the projection layer is simply the context word's vector syn0[l1..]
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    // When the buffered sentence is exhausted, go back and read the next one
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
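
// Illustrative sketch added for this walkthrough (not part of the original word2vec.c):
// the sigmoid lookup used by both the hierarchical-softmax and negative-sampling updates
// above. A dot product f in (-MAX_EXP, MAX_EXP) is shifted into (0, 2*MAX_EXP) and scaled
// by EXP_TABLE_SIZE / (2*MAX_EXP), giving an index into expTable, which main() fills with
// precomputed values of exp(x) / (exp(x) + 1), i.e. the sigmoid. The helper name is
// hypothetical and it assumes expTable has already been initialized.
static real SigmoidSketch(real f) {
  if (f >= MAX_EXP) return 1;    // the training loops skip or clamp these saturated cases instead
  if (f <= -MAX_EXP) return 0;
  return expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
}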
// Train the model
void TrainModel() {
  long a, b, c, d;
  FILE *fo;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));   // one handle per worker thread
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  // Decide where the vocabulary comes from:
  // if a vocabulary file was given, read it; otherwise build the vocabulary from the training file
  if (read_vocab_file[0] != 0) ReadVocab();
  else LearnVocabFromTrainFile();
  if (save_vocab_file[0] != 0) SaveVocab();   // optionally save the vocabulary
  // Without an output file there is nothing more to do
  if (output_file[0] == 0) return;
  InitNet();                                // initialize the network
  if (negative > 0) InitUnigramTable();     // prepare the table for negative sampling
  // Start training
  start = clock();
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
  // Write out the final result
  fo = fopen(output_file, "wb");
  if (classes == 0) {
    // Save the word vectors
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  } else {
    // Run K-means on the word vectors
    int clcn = classes, iter = 10, closeid;
    int *centcn = (int *)malloc(classes * sizeof(int));
    int *cl = (int *)calloc(vocab_size, sizeof(int));
    real closev, x;
    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
    for (a = 0; a < iter; a++) {
      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
      for (b = 0; b < clcn; b++) centcn[b] = 1;
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          cent[layer1_size * b + c] /= centcn[b];
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        closev = sqrt(closev);
        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
      }
      for (c = 0; c < vocab_size; c++) {
        closev = -10;
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // Save the K-means classes
    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
}
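
// Walkthrough note (added, not in the original source): the output written above has two
// shapes. With -classes 0 (the default) the first line is "vocab_size layer1_size" and
// each following line is a word followed by its layer1_size vector components, either as
// text ("%lf ") or, with -binary 1, as raw float values. With -classes N the vectors are
// first clustered by the K-means loop and each line is just "<word> <cluster id>".
// A hypothetical text-mode excerpt:
//   71291 100
//   </s> 0.004003 -0.003830 ...
//   the 0.093765 -0.182431 ...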
// Parse the command line: find the position of a flag in argv
int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
    // Found the requested flag
    if (a == argc - 1) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return a;   // return the position of the flag; its value is the next argument
  }
  return -1;
}
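
// Walkthrough note (added, not in the original source): a hypothetical invocation such as
//   ./word2vec -train data.txt -output vec.txt -size 200
// gives argv = {"./word2vec", "-train", "data.txt", "-output", "vec.txt", "-size", "200"};
// ArgPos((char *)"-size", argc, argv) returns 5, so main reads atoi(argv[6]) = 200 into
// layer1_size. A flag that is present but has no following value triggers the
// "Argument missing" error above.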
int main(int argc, char **argv) {
  int i;
  // With no arguments, print the usage text and exit
  if (argc == 1) {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-cbow <int>\n");
    printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
    return 0;
  }
  output_file[0] = 0;       // output file
  save_vocab_file[0] = 0;   // file the vocabulary is saved to
  read_vocab_file[0] = 0;   // file the vocabulary is read from
  // Parse the word2vec command-line parameters
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
  if (cbow) alpha = 0.05;
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));   // one struct per vocabulary word
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));                         // the word hash table
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));                   // EXP_TABLE_SIZE + 1 entries
  // Precompute the sigmoid values
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);   // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1);                     // Precompute f(x) = x / (x + 1)
  }
  // Start model training
  TrainModel();
  return 0;
}