运行 哈希图特征向量上的感知器算法:java
run perceptron algorithm on a hash map feature vector: java
我有以下代码,它将目录中的许多文件读入哈希映射,这是我的 feature vector。从某种意义上说,它没有词干,这有点天真,但这不是我现在最关心的问题。我想知道如何使用此数据结构作为感知器算法的输入。我想我们称之为词袋,不是吗?
public class BagOfWords
{
    /** Word -> occurrence count, accumulated across every file under the root. */
    static Map<String, Integer> bag_of_words = new HashMap<>();

    public static void main(String[] args) throws IOException
    {
        // BUG FIX: the original literal was missing its closing quote.
        String path = "/home/flavius/atheism";
        File file = new File( path );
        new BagOfWords().iterateDirectory(file);
        for (Map.Entry<String, Integer> entry : bag_of_words.entrySet())
        {
            System.out.println(entry.getKey()+" : "+entry.getValue());
        }
    }

    /**
     * Recursively walks {@code file}, splitting every regular file's lines on
     * single spaces and counting each token into {@link #bag_of_words}.
     * Package-private (was private) so the traversal can be exercised in tests.
     *
     * @param file directory (or file-tree root) to scan
     * @throws IOException if a file cannot be read
     */
    void iterateDirectory(File file) throws IOException
    {
        File[] children = file.listFiles();
        if (children == null)
        {
            return; // not a directory, or unreadable: nothing to count
        }
        for (File f : children)
        {
            if (f.isDirectory())
            {
                // BUG FIX: the original recursed on the parent ("file"),
                // looping forever; descend into the child instead.
                iterateDirectory(f);
            }
            else
            {
                // try-with-resources closes the reader even on exceptions;
                // the original leaked one reader per file.
                try (BufferedReader br = new BufferedReader(new FileReader( f )))
                {
                    String line;
                    while ((line = br.readLine()) != null)
                    {
                        for (String word : line.split(" "))
                        {
                            // merge() replaces the containsKey/put/put dance.
                            bag_of_words.merge(word, 1, Integer::sum);
                        }
                    }
                }
            }
        }
    }
}
你可以看到路径到一个目录叫'atheism'还有一个叫sports,我想尝试线性分离这两个类的文件,然后尝试分离看不见的测试文档分为任一类别。
怎么做?如何将其概念化。我会很感激可靠的参考、全面的解释或某种伪代码。
我在网上找不到很多信息丰富且清晰的参考资料。
让我们先建立一些词汇表(我猜你正在使用 20 新闻组数据集):
- "Class Label" 是您要预测的,在您的二进制情况下,这是 "atheism" 与其余
- "Feature vector" 这就是您输入 classifier 的内容
- "Document" 这是来自数据集的一封电子邮件
- "Token" 文档的一小部分,通常是 unigram/bigram/trigram
- "Dictionary" 一组 "allowed" 个单词作为你的向量
所以词袋的向量化算法通常遵循以下步骤:
- 遍历所有文档(跨越所有 class 标签)并收集所有标记,这是您的字典和特征向量的维度
- 再次检查所有文档,并针对每个文档执行以下操作:
- 创建一个具有字典维度的新特征向量(例如 200,对应该字典中的 200 个条目)
- 遍历该文档中的所有标记并在特征向量的这个维度上设置字数(在该文档中)
- 您现在有一个特征向量列表,您可以将其输入到您的算法中
示例:
Document 1 = ["I", "am", "awesome"]
Document 2 = ["I", "am", "great", "great"]
字典是:
["I", "am", "awesome", "great"]
所以作为矢量的文档看起来像:
Document 1 = [1, 1, 1, 0]
Document 2 = [1, 1, 0, 2]
有了它,您就可以做各种奇特的数学运算,并将其输入您的感知器。
这是对我最初问题的完整回答,张贴在这里是为了未来的读者
给定以下文件:
atheism/a_0.txt
Gott ist tot.
politics/p_0.txt
L'Etat, c'est moi , et aussi moi .
science/s_0.txt
If I have seen further it is by standing on the shoulders of giants.
sports/s_1.txt
You miss 100% of the shots you don't take.
输出数据结构:
/data/train/politics/p_0.txt, [0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
/data/train/science/s_0.txt, [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0]
/data/train/atheism/a_0.txt, [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
/data/train/sports/s_1.txt, [0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1]
代码看起来像这样,或者您可以在我的 GitHub 页面上找到它。
public class FileDictCreateur
{
    static String PATH = "/home/matthias/Workbench/SUTD/ISTD_50.570/assignments/practice_data/data/train";

    // Global vocabulary: every distinct token seen across all articles.
    static Set<String> GLOBO_DICT = new HashSet<String>();

    // True once the first pass over all categories has frozen GLOBO_DICT.
    static boolean globo_dict_fixed = false;

    // Per-file token lists gathered on the second pass.
    static Map<File, ArrayList<String>> fileDict = new HashMap<>();

    // Final structure fed to the perceptron: file -> term-count vector.
    static Map<File, int[]> perceptron_input = new HashMap<>();

    public static void main(String[] args) throws IOException
    {
        // each of the different categories
        String[] categories = { "/atheism", "/politics", "/science", "/sports"};

        // Pass 1: visit every category once to populate the global dictionary.
        for(int cycle = 0; cycle <= 3; cycle++)
        {
            String general_data_partition = PATH + categories[cycle];
            File directory = new File( general_data_partition );
            iterateDirectory( directory , globo_dict_fixed);
            if(cycle == 3)
                globo_dict_fixed = true;
        }

        // Pass 2: visit again to record each file's tokens against the
        // now-frozen dictionary.
        for(int cycle = 0; cycle <= 3; cycle++)
        {
            String general_data_partition = PATH + categories[cycle];
            File directory = new File( general_data_partition );
            iterateDirectory( directory , globo_dict_fixed);
        }

        perceptron_data_struc_generateur( GLOBO_DICT, fileDict, perceptron_input );

        // print the output
        for (Map.Entry<File, int[]> entry : perceptron_input.entrySet())
        {
            System.out.println(entry.getKey() + ", " + Arrays.toString(entry.getValue()));
        }
    }

    /**
     * Recursively walks {@code directory}; in pass 1 feeds tokens to the
     * global dictionary, in pass 2 records them per file.
     */
    private static void iterateDirectory(File directory, boolean globo_dict_fixed) throws IOException
    {
        File[] children = directory.listFiles();
        if (children == null)
        {
            return; // not a directory, or unreadable
        }
        for (File file : children)
        {
            if (file.isDirectory())
            {
                // BUG FIX: the original recursed on the parent "directory",
                // causing infinite recursion; descend into "file" instead.
                iterateDirectory(file, globo_dict_fixed);
            }
            else
            {
                // try-with-resources: the original leaked one reader per file.
                try (BufferedReader br = new BufferedReader(new FileReader( file )))
                {
                    String line;
                    while ((line = br.readLine()) != null)
                    {
                        String[] words = line.split(" ");//those are your words
                        if(!globo_dict_fixed)
                        {
                            populate_globo_dict( words );
                        }
                        else
                        {
                            create_file_dict( file, words );
                        }
                    }
                }
            }
        }
    }

    /** Appends this line's tokens to the file's token list. */
    public static void create_file_dict( File file, String[] words ) throws IOException
    {
        // BUG FIX: the original skipped files already present in the map,
        // silently discarding every line after the first. Append instead.
        ArrayList<String> document_words =
                fileDict.computeIfAbsent(file, k -> new ArrayList<String>());
        document_words.addAll(Arrays.asList(words));
    }

    /** Adds every token to the global dictionary. */
    public static void populate_globo_dict( String[] words ) throws IOException
    {
        // Set.add is already a no-op for duplicates; no contains() check needed.
        GLOBO_DICT.addAll(Arrays.asList(words));
    }

    /**
     * For each file, builds an int vector with one slot per dictionary word,
     * holding the number of times that word occurs in the file, and stores it
     * in {@code perceptron_input} keyed by the file.
     */
    public static void perceptron_data_struc_generateur(Set<String> GLOBO_DICT,
        Map<File, ArrayList<String>> fileDict,
        Map<File, int[]> perceptron_input)
    {
        // Fix an index per dictionary word so vector positions are stable.
        List<String> GLOBO_DICT_list = new ArrayList<>(GLOBO_DICT);

        // Precompute word -> index once; the original called List.indexOf
        // inside a doubly-nested loop (O(dict * tokens * dict)).
        Map<String, Integer> index_of = new HashMap<>();
        for (int i = 0; i < GLOBO_DICT_list.size(); i++)
        {
            index_of.put(GLOBO_DICT_list.get(i), i);
        }

        for (Map.Entry<File, ArrayList<String>> entry : fileDict.entrySet())
        {
            int[] cross_czech = new int[GLOBO_DICT_list.size()]; // zero-initialized by Java
            for (String token : entry.getValue())
            {
                Integer i = index_of.get(token);
                if (i != null) // tokens outside the dictionary are ignored
                {
                    cross_czech[i]++;
                }
            }
            perceptron_input.put( entry.getKey() , cross_czech);
        }
    }
}
我有以下代码,它将目录中的许多文件读入哈希映射,这是我的 feature vector。从某种意义上说,它没有词干,这有点天真,但这不是我现在最关心的问题。我想知道如何使用此数据结构作为感知器算法的输入。我想我们称之为词袋,不是吗?
public class BagOfWords
{
    /** Word -> occurrence count, accumulated across every file under the root. */
    static Map<String, Integer> bag_of_words = new HashMap<>();

    public static void main(String[] args) throws IOException
    {
        // BUG FIX: the original literal was missing its closing quote.
        String path = "/home/flavius/atheism";
        File file = new File( path );
        new BagOfWords().iterateDirectory(file);
        for (Map.Entry<String, Integer> entry : bag_of_words.entrySet())
        {
            System.out.println(entry.getKey()+" : "+entry.getValue());
        }
    }

    /**
     * Recursively walks {@code file}, splitting every regular file's lines on
     * single spaces and counting each token into {@link #bag_of_words}.
     * Package-private (was private) so the traversal can be exercised in tests.
     *
     * @param file directory (or file-tree root) to scan
     * @throws IOException if a file cannot be read
     */
    void iterateDirectory(File file) throws IOException
    {
        File[] children = file.listFiles();
        if (children == null)
        {
            return; // not a directory, or unreadable: nothing to count
        }
        for (File f : children)
        {
            if (f.isDirectory())
            {
                // BUG FIX: the original recursed on the parent ("file"),
                // looping forever; descend into the child instead.
                iterateDirectory(f);
            }
            else
            {
                // try-with-resources closes the reader even on exceptions;
                // the original leaked one reader per file.
                try (BufferedReader br = new BufferedReader(new FileReader( f )))
                {
                    String line;
                    while ((line = br.readLine()) != null)
                    {
                        for (String word : line.split(" "))
                        {
                            // merge() replaces the containsKey/put/put dance.
                            bag_of_words.merge(word, 1, Integer::sum);
                        }
                    }
                }
            }
        }
    }
}
你可以看到路径到一个目录叫'atheism'还有一个叫sports,我想尝试线性分离这两个类的文件,然后尝试分离看不见的测试文档分为任一类别。
怎么做?如何将其概念化。我会很感激可靠的参考、全面的解释或某种伪代码。
我在网上找不到很多信息丰富且清晰的参考资料。
让我们先建立一些词汇表(我猜你正在使用 20 新闻组数据集):
- "Class Label" 是您要预测的,在您的二进制情况下,这是 "atheism" 与其余
- "Feature vector" 这就是您输入 classifier 的内容
- "Document" 这是来自数据集 的一封电子邮件
- "Token" 文档的一小部分,通常是 unigram/bigram/trigram
- "Dictionary" 一组 "allowed" 个单词作为你的向量
所以词袋的向量化算法通常遵循以下步骤:
- 遍历所有文档(跨越所有 class 标签)并收集所有标记,这是您的字典和特征向量的维度
- 再次检查所有文档,并针对每个文档执行以下操作:
- 创建一个具有字典维度的新特征向量(例如 200,对应该字典中的 200 个条目)
- 遍历该文档中的所有标记并在特征向量的这个维度上设置字数(在该文档中)
- 您现在有一个特征向量列表,您可以将其输入到您的算法中
示例:
Document 1 = ["I", "am", "awesome"]
Document 2 = ["I", "am", "great", "great"]
字典是:
["I", "am", "awesome", "great"]
所以作为矢量的文档看起来像:
Document 1 = [1, 1, 1, 0]
Document 2 = [1, 1, 0, 2]
有了它,您就可以做各种奇特的数学运算,并将其输入您的感知器。
这是对我最初问题的完整回答,张贴在这里是为了未来的读者
给定以下文件:
atheism/a_0.txt
Gott ist tot.
politics/p_0.txt
L'Etat, c'est moi , et aussi moi .
science/s_0.txt
If I have seen further it is by standing on the shoulders of giants.
sports/s_1.txt
You miss 100% of the shots you don't take.
输出数据结构:
/data/train/politics/p_0.txt, [0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
/data/train/science/s_0.txt, [1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0]
/data/train/atheism/a_0.txt, [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
/data/train/sports/s_1.txt, [0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1]
代码看起来像这样,或者您可以在 my GitHub page.
上找到它
public class FileDictCreateur
{
    static String PATH = "/home/matthias/Workbench/SUTD/ISTD_50.570/assignments/practice_data/data/train";

    // Global vocabulary: every distinct token seen across all articles.
    static Set<String> GLOBO_DICT = new HashSet<String>();

    // True once the first pass over all categories has frozen GLOBO_DICT.
    static boolean globo_dict_fixed = false;

    // Per-file token lists gathered on the second pass.
    static Map<File, ArrayList<String>> fileDict = new HashMap<>();

    // Final structure fed to the perceptron: file -> term-count vector.
    static Map<File, int[]> perceptron_input = new HashMap<>();

    public static void main(String[] args) throws IOException
    {
        // each of the different categories
        String[] categories = { "/atheism", "/politics", "/science", "/sports"};

        // Pass 1: visit every category once to populate the global dictionary.
        for(int cycle = 0; cycle <= 3; cycle++)
        {
            String general_data_partition = PATH + categories[cycle];
            File directory = new File( general_data_partition );
            iterateDirectory( directory , globo_dict_fixed);
            if(cycle == 3)
                globo_dict_fixed = true;
        }

        // Pass 2: visit again to record each file's tokens against the
        // now-frozen dictionary.
        for(int cycle = 0; cycle <= 3; cycle++)
        {
            String general_data_partition = PATH + categories[cycle];
            File directory = new File( general_data_partition );
            iterateDirectory( directory , globo_dict_fixed);
        }

        perceptron_data_struc_generateur( GLOBO_DICT, fileDict, perceptron_input );

        // print the output
        for (Map.Entry<File, int[]> entry : perceptron_input.entrySet())
        {
            System.out.println(entry.getKey() + ", " + Arrays.toString(entry.getValue()));
        }
    }

    /**
     * Recursively walks {@code directory}; in pass 1 feeds tokens to the
     * global dictionary, in pass 2 records them per file.
     */
    private static void iterateDirectory(File directory, boolean globo_dict_fixed) throws IOException
    {
        File[] children = directory.listFiles();
        if (children == null)
        {
            return; // not a directory, or unreadable
        }
        for (File file : children)
        {
            if (file.isDirectory())
            {
                // BUG FIX: the original recursed on the parent "directory",
                // causing infinite recursion; descend into "file" instead.
                iterateDirectory(file, globo_dict_fixed);
            }
            else
            {
                // try-with-resources: the original leaked one reader per file.
                try (BufferedReader br = new BufferedReader(new FileReader( file )))
                {
                    String line;
                    while ((line = br.readLine()) != null)
                    {
                        String[] words = line.split(" ");//those are your words
                        if(!globo_dict_fixed)
                        {
                            populate_globo_dict( words );
                        }
                        else
                        {
                            create_file_dict( file, words );
                        }
                    }
                }
            }
        }
    }

    /** Appends this line's tokens to the file's token list. */
    public static void create_file_dict( File file, String[] words ) throws IOException
    {
        // BUG FIX: the original skipped files already present in the map,
        // silently discarding every line after the first. Append instead.
        ArrayList<String> document_words =
                fileDict.computeIfAbsent(file, k -> new ArrayList<String>());
        document_words.addAll(Arrays.asList(words));
    }

    /** Adds every token to the global dictionary. */
    public static void populate_globo_dict( String[] words ) throws IOException
    {
        // Set.add is already a no-op for duplicates; no contains() check needed.
        GLOBO_DICT.addAll(Arrays.asList(words));
    }

    /**
     * For each file, builds an int vector with one slot per dictionary word,
     * holding the number of times that word occurs in the file, and stores it
     * in {@code perceptron_input} keyed by the file.
     */
    public static void perceptron_data_struc_generateur(Set<String> GLOBO_DICT,
        Map<File, ArrayList<String>> fileDict,
        Map<File, int[]> perceptron_input)
    {
        // Fix an index per dictionary word so vector positions are stable.
        List<String> GLOBO_DICT_list = new ArrayList<>(GLOBO_DICT);

        // Precompute word -> index once; the original called List.indexOf
        // inside a doubly-nested loop (O(dict * tokens * dict)).
        Map<String, Integer> index_of = new HashMap<>();
        for (int i = 0; i < GLOBO_DICT_list.size(); i++)
        {
            index_of.put(GLOBO_DICT_list.get(i), i);
        }

        for (Map.Entry<File, ArrayList<String>> entry : fileDict.entrySet())
        {
            int[] cross_czech = new int[GLOBO_DICT_list.size()]; // zero-initialized by Java
            for (String token : entry.getValue())
            {
                Integer i = index_of.get(token);
                if (i != null) // tokens outside the dictionary are ignored
                {
                    cross_czech[i]++;
                }
            }
            perceptron_input.put( entry.getKey() , cross_czech);
        }
    }
}