lucene.net 索引中的重复文档
Duplicate documents in lucene.net index
我正在使用 Lucene.Net 来索引我的 PDF 文件。刷新索引后,同一个文档(document)会被重复显示多次(重复次数等于我刷新索引的次数)。
我使用的是最新版本的 Lucene.Net(3.0.3)。
这是我的索引代码。
/// <summary>
/// Rebuilds the Lucene index from the PDF files found under the configured PDF folder.
/// The index is opened in "create" mode and emptied with <c>DeleteAll</c> before any document is added,
/// then one document per distinct file path is written with the fields
/// filename, text, path, modifieddate, documenttype and vault.
/// </summary>
/// <remarks>
/// The document type / vault of a file is taken from the last entry of <c>ltDocumentTypes</c> whose
/// search text occurs (case-insensitively) in the first 150 characters of the extracted text;
/// when nothing matches both default to "Default".
/// </remarks>
public void refreshIndexes()
{
    string strIndexDir = @"Z:\Munavvar\LuceneTest\index";

    // `using` makes sure the directory and the writer are disposed even when text extraction
    // or indexing throws half-way through the file list.
    using (var indexDirectory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)))
    using (IndexWriter writer = new IndexWriter(indexDirectory, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        writer.DeleteAll();

        // Find all files in the root folder and index each of them.
        List<string> lstFiles = searchFiles(@"Z:\Munavvar\LuceneTest\PDFs");

        // NOTE(review): the index is recreated above, so duplicate hits can only come from the same
        // path being listed more than once - presumably searchFiles accumulates results in instance
        // state across calls; confirm against its implementation. Each path is indexed only once here.
        HashSet<string> indexedPaths = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (string strFile in lstFiles)
        {
            if (!indexedPaths.Add(strFile))
            {
                continue;
            }

            string fileName = System.IO.Path.GetFileNameWithoutExtension(strFile);
            // A null text would break both Substring and the Field constructor, so treat it as empty.
            string text = ExtractTextFromPdf(strFile) ?? string.Empty;
            string modifiedDate = Convert.ToString(File.GetLastWriteTime(strFile));
            string documentType = string.Empty;
            string vault = string.Empty;

            // Only the first 150 characters are inspected when classifying the document.
            string headerText = text.Substring(0, Math.Min(text.Length, 150));
            foreach (var docs in ltDocumentTypes)
            {
                // Culture-independent, allocation-free case-insensitive containment check.
                // No break: as before, the last matching entry wins.
                if (headerText.IndexOf(docs.searchText, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    documentType = docs.DocumentType;
                    vault = docs.VaultName;
                }
            }

            if (string.IsNullOrEmpty(documentType))
            {
                documentType = "Default";
                vault = "Default";
            }

            Document doc = new Document();
            doc.Add(new Field("filename", fileName, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("path", strFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("modifieddate", modifiedDate, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("documenttype", documentType, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("vault", vault, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }

        writer.Optimize();
    }
}
这是我的索引搜索代码
/// <summary>
/// Runs <paramref name="searchText"/> as a Lucene query (default field "text") against the index
/// and returns every hit as an <c>IndexDocument</c> built from the stored fields.
/// </summary>
/// <param name="searchText">Query string in Lucene query-parser syntax.</param>
/// <returns>
/// The matching documents; an empty list when <paramref name="searchText"/> is null or empty,
/// or when the index contains no documents.
/// </returns>
/// <remarks>
/// Exceptions from query parsing or index access propagate unchanged. The former
/// <c>catch (Exception ex) { throw ex; }</c> wrapper only reset the stack trace and was removed.
/// </remarks>
public List<IndexDocument> searchFromIndexes(string searchText)
{
    List<IndexDocument> searchResult = new List<IndexDocument>();
    if (string.IsNullOrEmpty(searchText))
    {
        return searchResult;
    }

    string strIndexDir = @"Z:\Munavvar\LuceneTest\index";
    var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

    // `using` releases the directory and the searcher even when Parse or Search throws.
    using (var indexDirectory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)))
    using (IndexSearcher searcher = new IndexSearcher(indexDirectory))
    {
        // Parse the query; "text" is the default field to search.
        Lucene.Net.QueryParsers.QueryParser parser = new Lucene.Net.QueryParsers.QueryParser(Lucene.Net.Util.Version.LUCENE_29, "text", analyzer);
        Query query = parser.Parse(searchText);

        // Search requires a positive hit count, so an empty index is answered without searching.
        int maxDoc = searcher.MaxDoc;
        if (maxDoc <= 0)
        {
            return searchResult;
        }

        // Ask for up to maxDoc hits, i.e. every document that can possibly match.
        TopDocs hits = searcher.Search(query, maxDoc);

        // Iterate the returned hits themselves rather than indexing ScoreDocs by TotalHits.
        foreach (var scoreDoc in hits.ScoreDocs)
        {
            Document doc = searcher.Doc(scoreDoc.Doc);
            searchResult.Add(new IndexDocument()
            {
                FileName = doc.Get("filename"),
                Text = doc.Get("text"),
                Path = doc.Get("path"),
                ModifiedDate = doc.Get("modifieddate"),
                Vault = doc.Get("vault"),
                DocumentType = doc.Get("documenttype"),
            });
        }
    }

    return searchResult;
}
更新
I have one button on the window that calls the refreshIndexes method.
The old index is only cleared when I close the application, run it again, and click that button.
我找到了解决办法。
问题:
我正在从全局 class 对象调用 refreshIndexes 方法。
// One VaultIndexes instance, created by the field initializer and reused for every click.
// The post reports that this shared-instance variant yields duplicate documents.
private VaultIndexes vIndexes = new VaultIndexes();

/// <summary>
/// Click handler of the refresh button: rebuilds the index through the shared
/// <c>vIndexes</c> field.
/// </summary>
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e)
{
    this.vIndexes.refreshIndexes();
}
解决方法:
每次都创建一个新的对象。
/// <summary>
/// Click handler of the refresh button: rebuilds the index using a
/// <c>VaultIndexes</c> object that is created for this click only.
/// </summary>
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e)
{
    // Intentionally not kept in a field: each click works on a brand-new instance.
    new VaultIndexes().refreshIndexes();
}
I don't know why it creates duplicate documents when the global class
object is used.
As @RichaGarg stated in a comment, it should not create duplicate documents,
given the third argument (create = true) passed to the IndexWriter constructor:
// Third constructor argument (create) is true: per the Lucene API this creates a new index or overwrites the existing one instead of appending to it.
IndexWriter writer = new IndexWriter(Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);
我正在使用 Lucene.Net 来索引我的 PDF 文件。刷新索引后,同一个文档(document)会被重复显示多次(重复次数等于我刷新索引的次数)。
我使用的是最新版本的 Lucene.Net(3.0.3)。
这是我的索引代码。
/// <summary>
/// Rebuilds the Lucene index from the PDF files found under the configured PDF folder.
/// The index is opened in "create" mode and emptied with <c>DeleteAll</c> before any document is added,
/// then one document per distinct file path is written with the fields
/// filename, text, path, modifieddate, documenttype and vault.
/// </summary>
/// <remarks>
/// The document type / vault of a file is taken from the last entry of <c>ltDocumentTypes</c> whose
/// search text occurs (case-insensitively) in the first 150 characters of the extracted text;
/// when nothing matches both default to "Default".
/// </remarks>
public void refreshIndexes()
{
    string strIndexDir = @"Z:\Munavvar\LuceneTest\index";

    // `using` makes sure the directory and the writer are disposed even when text extraction
    // or indexing throws half-way through the file list.
    using (var indexDirectory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)))
    using (IndexWriter writer = new IndexWriter(indexDirectory, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        writer.DeleteAll();

        // Find all files in the root folder and index each of them.
        List<string> lstFiles = searchFiles(@"Z:\Munavvar\LuceneTest\PDFs");

        // NOTE(review): the index is recreated above, so duplicate hits can only come from the same
        // path being listed more than once - presumably searchFiles accumulates results in instance
        // state across calls; confirm against its implementation. Each path is indexed only once here.
        HashSet<string> indexedPaths = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        foreach (string strFile in lstFiles)
        {
            if (!indexedPaths.Add(strFile))
            {
                continue;
            }

            string fileName = System.IO.Path.GetFileNameWithoutExtension(strFile);
            // A null text would break both Substring and the Field constructor, so treat it as empty.
            string text = ExtractTextFromPdf(strFile) ?? string.Empty;
            string modifiedDate = Convert.ToString(File.GetLastWriteTime(strFile));
            string documentType = string.Empty;
            string vault = string.Empty;

            // Only the first 150 characters are inspected when classifying the document.
            string headerText = text.Substring(0, Math.Min(text.Length, 150));
            foreach (var docs in ltDocumentTypes)
            {
                // Culture-independent, allocation-free case-insensitive containment check.
                // No break: as before, the last matching entry wins.
                if (headerText.IndexOf(docs.searchText, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    documentType = docs.DocumentType;
                    vault = docs.VaultName;
                }
            }

            if (string.IsNullOrEmpty(documentType))
            {
                documentType = "Default";
                vault = "Default";
            }

            Document doc = new Document();
            doc.Add(new Field("filename", fileName, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("path", strFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.Add(new Field("modifieddate", modifiedDate, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("documenttype", documentType, Field.Store.YES, Field.Index.ANALYZED));
            doc.Add(new Field("vault", vault, Field.Store.YES, Field.Index.ANALYZED));
            writer.AddDocument(doc);
        }

        writer.Optimize();
    }
}
这是我的索引搜索代码
/// <summary>
/// Runs <paramref name="searchText"/> as a Lucene query (default field "text") against the index
/// and returns every hit as an <c>IndexDocument</c> built from the stored fields.
/// </summary>
/// <param name="searchText">Query string in Lucene query-parser syntax.</param>
/// <returns>
/// The matching documents; an empty list when <paramref name="searchText"/> is null or empty,
/// or when the index contains no documents.
/// </returns>
/// <remarks>
/// Exceptions from query parsing or index access propagate unchanged. The former
/// <c>catch (Exception ex) { throw ex; }</c> wrapper only reset the stack trace and was removed.
/// </remarks>
public List<IndexDocument> searchFromIndexes(string searchText)
{
    List<IndexDocument> searchResult = new List<IndexDocument>();
    if (string.IsNullOrEmpty(searchText))
    {
        return searchResult;
    }

    string strIndexDir = @"Z:\Munavvar\LuceneTest\index";
    var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);

    // `using` releases the directory and the searcher even when Parse or Search throws.
    using (var indexDirectory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)))
    using (IndexSearcher searcher = new IndexSearcher(indexDirectory))
    {
        // Parse the query; "text" is the default field to search.
        Lucene.Net.QueryParsers.QueryParser parser = new Lucene.Net.QueryParsers.QueryParser(Lucene.Net.Util.Version.LUCENE_29, "text", analyzer);
        Query query = parser.Parse(searchText);

        // Search requires a positive hit count, so an empty index is answered without searching.
        int maxDoc = searcher.MaxDoc;
        if (maxDoc <= 0)
        {
            return searchResult;
        }

        // Ask for up to maxDoc hits, i.e. every document that can possibly match.
        TopDocs hits = searcher.Search(query, maxDoc);

        // Iterate the returned hits themselves rather than indexing ScoreDocs by TotalHits.
        foreach (var scoreDoc in hits.ScoreDocs)
        {
            Document doc = searcher.Doc(scoreDoc.Doc);
            searchResult.Add(new IndexDocument()
            {
                FileName = doc.Get("filename"),
                Text = doc.Get("text"),
                Path = doc.Get("path"),
                ModifiedDate = doc.Get("modifieddate"),
                Vault = doc.Get("vault"),
                DocumentType = doc.Get("documenttype"),
            });
        }
    }

    return searchResult;
}
更新
I have one button on the window that calls the refreshIndexes method.
The old index is only cleared when I close the application, run it again, and click that button.
我找到了解决办法。
问题: 我正在从全局 class 对象调用 refreshIndexes 方法。
// One VaultIndexes instance, created by the field initializer and reused for every click.
// The post reports that this shared-instance variant yields duplicate documents.
private VaultIndexes vIndexes = new VaultIndexes();

/// <summary>
/// Click handler of the refresh button: rebuilds the index through the shared
/// <c>vIndexes</c> field.
/// </summary>
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e)
{
    this.vIndexes.refreshIndexes();
}
解决方法: 每次都创建一个新的对象。
/// <summary>
/// Click handler of the refresh button: rebuilds the index using a
/// <c>VaultIndexes</c> object that is created for this click only.
/// </summary>
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e)
{
    // Intentionally not kept in a field: each click works on a brand-new instance.
    new VaultIndexes().refreshIndexes();
}
I don't know why it creates duplicate documents when the global class object is used.
As @RichaGarg stated in a comment, it should not create duplicate documents, given the third argument (create = true) passed to the IndexWriter constructor:
// Third constructor argument (create) is true: per the Lucene API this creates a new index or overwrites the existing one instead of appending to it.
IndexWriter writer = new IndexWriter(Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);