lucene.net 索引中的重复文档

Duplicate documents in lucene.net index

我正在使用 lucene.net 来索引我的 pdf 文件。刷新索引后,搜索结果中同一个文档(document)会重复出现多次,重复的次数等于我刷新索引的次数。

我正在使用最新版本的 lucene.net 索引 (Lucene.net 3.0.3)。

这是我的索引代码。

/// <summary>
/// Rebuilds the Lucene index from scratch: every PDF returned by <c>searchFiles</c> is
/// converted to text, classified by document type, and added as one Lucene document.
/// </summary>
/// <remarks>
/// The writer is opened with <c>create = true</c> and <c>DeleteAll()</c> is called, so the
/// documents of any previous run are discarded. If extraction or indexing fails, the pending
/// changes are rolled back and the exception is rethrown to the caller.
/// </remarks>
public void refreshIndexes()
    {
        string strIndexDir = @"Z:\Munavvar\LuceneTest\index";

        // "using" guarantees the directory and the writer are released even when a PDF
        // cannot be read; previously an exception left the writer open.
        using (var directory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)))
        using (var writer = new IndexWriter(directory, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED))
        {
            try
            {
                writer.DeleteAll();

                // NOTE(review): the index is recreated on every call, so duplicate hits can
                // only come from the same path being returned more than once. Presumably
                // searchFiles accumulates results in shared state between calls - confirm
                // and fix there. Skipping repeated paths keeps the index correct regardless.
                var indexedPaths = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

                // Find all files in the root folder and index each of them once.
                List<string> lstFiles = searchFiles(@"Z:\Munavvar\LuceneTest\PDFs");
                foreach (string strFile in lstFiles)
                {
                    if (!indexedPaths.Add(strFile))
                    {
                        continue;
                    }

                    string fileName = System.IO.Path.GetFileNameWithoutExtension(strFile);
                    // ExtractTextFromPdf is not visible here; treat a null result as empty
                    // text so Substring and the Field constructor below cannot fail on it.
                    string text = ExtractTextFromPdf(strFile) ?? string.Empty;
                    string modifiedDate = Convert.ToString(File.GetLastWriteTime(strFile));
                    string documentType = string.Empty;
                    string vault = string.Empty;

                    // Only the first 150 characters are inspected to classify the document.
                    string headerText = text.Substring(0, Math.Min(text.Length, 150));
                    foreach (var docs in ltDocumentTypes)
                    {
                        // Case-insensitive match without culture-dependent upper-casing.
                        // There is deliberately no break: the last matching entry wins.
                        if (headerText.IndexOf(docs.searchText, StringComparison.OrdinalIgnoreCase) >= 0)
                        {
                            documentType = docs.DocumentType;
                            vault = docs.VaultName;
                        }
                    }

                    if (string.IsNullOrEmpty(documentType))
                    {
                        documentType = "Default";
                        vault = "Default";
                    }

                    Document doc = new Document();
                    doc.Add(new Field("filename", fileName, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
                    // The path is stored as a single untokenized term.
                    doc.Add(new Field("path", strFile, Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.Add(new Field("modifieddate", modifiedDate, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("documenttype", documentType, Field.Store.YES, Field.Index.ANALYZED));
                    doc.Add(new Field("vault", vault, Field.Store.YES, Field.Index.ANALYZED));

                    writer.AddDocument(doc);
                }

                writer.Optimize();
            }
            catch
            {
                // Do not commit a half-built index when something went wrong.
                writer.Rollback();
                throw;
            }
        }
    }

这是我的索引搜索代码

/// <summary>
/// Runs <paramref name="searchText"/> as a Lucene query against the index and returns every
/// matching document as an <c>IndexDocument</c>.
/// </summary>
/// <param name="searchText">
/// Query text in Lucene query-parser syntax; terms without a field prefix search the "text" field.
/// </param>
/// <returns>
/// The matching documents in the order Lucene returns them; an empty list when
/// <paramref name="searchText"/> is null or empty, or when the index contains no documents.
/// </returns>
/// <remarks>
/// Exceptions raised while opening the index, parsing the query or searching are not caught
/// here; they propagate to the caller with their original stack trace.
/// </remarks>
public List<IndexDocument> searchFromIndexes(string searchText)
    {
        List<IndexDocument> searchResult = new List<IndexDocument>();

        if (string.IsNullOrEmpty(searchText))
        {
            return searchResult;
        }

        string strIndexDir = @"Z:\Munavvar\LuceneTest\index";

        // One version for analyzer and parser, matching the version refreshIndexes uses at
        // index time, so query text is tokenized the same way as the indexed text.
        var matchVersion = Lucene.Net.Util.Version.LUCENE_29;

        // "using" releases the directory and the searcher even when parsing or searching throws.
        using (var directory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir)))
        using (var searcher = new IndexSearcher(directory))
        {
            // Parse the query; "text" is the default field to search.
            var analyzer = new StandardAnalyzer(matchVersion);
            var parser = new Lucene.Net.QueryParsers.QueryParser(matchVersion, "text", analyzer);
            Query query = parser.Parse(searchText);

            // Search requires a positive result count, so an empty index is answered
            // directly instead of asking Lucene for zero hits.
            int maxDoc = searcher.MaxDoc;
            if (maxDoc == 0)
            {
                return searchResult;
            }

            // Requesting maxDoc hits returns every match rather than a top-N page.
            TopDocs hits = searcher.Search(query, maxDoc);

            // Iterate the hits actually returned rather than counting up to TotalHits.
            foreach (var hit in hits.ScoreDocs)
            {
                // Load the stored fields of the matching document.
                Document doc = searcher.Doc(hit.Doc);

                searchResult.Add(new IndexDocument()
                {
                    FileName = doc.Get("filename"),
                    Text = doc.Get("text"),
                    Path = doc.Get("path"),
                    ModifiedDate = doc.Get("modifieddate"),
                    Vault = doc.Get("vault"),
                    DocumentType = doc.Get("documenttype"),
                });
            }
        }

        return searchResult;
    }

更新

I have one button on the window that calls the refreshIndexes method.

The old index is cleared only when I close the application, run it again, and then click that button.

想出解决办法。

问题: 我正在从全局 class 对象调用 refreshIndexes 方法。

// One VaultIndexes instance is created by this field initializer and is then
// reused by every click of the refresh button.
private VaultIndexes vIndexes = new VaultIndexes();

/// <summary>
/// Click handler of the refresh button: rebuilds the index using the shared instance.
/// </summary>
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e)
{
    this.vIndexes.refreshIndexes();
}

解决方法: 每次点击时都创建一个新的 class 对象。

/// <summary>
/// Click handler of the refresh button: rebuilds the index with a newly created
/// VaultIndexes object.
/// </summary>
private void btnRefreshIndex_Click(object sender, RoutedEventArgs e)
{
    // A fresh instance per click, so no instance state from an earlier click is reused.
    new VaultIndexes().refreshIndexes();
}

I don't know why it creates duplicate documents when a global class object is used.

As @RichaGarg stated in a comment, it should not produce duplicate documents, given the third argument (`true`) passed to the IndexWriter constructor:

// The third constructor argument is the "create" flag; true asks Lucene to create the
// index or overwrite an existing one instead of appending to it.
var indexDirectory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(strIndexDir));
var indexAnalyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29);
IndexWriter writer = new IndexWriter(indexDirectory, indexAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);