How to activate stemming in my Lucene search code - C#

Can someone please help me activate stemming in my code? I have tried a lot, but without much success :(
My current code
/// <summary>
/// Builds an in-memory Lucene index containing one document per row of <paramref name="table"/>.
/// </summary>
/// <param name="table">Source rows; each must expose "DishName" and "CustomisationID" columns.</param>
/// <returns>The RAMDirectory holding the finished index.</returns>
Directory createIndex(DataTable table)
{
    var ramDirectory = new RAMDirectory();
    using (Analyzer standardAnalyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30))
    using (var indexWriter = new IndexWriter(ramDirectory, standardAnalyzer, new IndexWriter.MaxFieldLength(1000)))
    {
        foreach (DataRow dataRow in table.Rows)
        {
            var luceneDoc = new Document();

            // DishName is the searchable text, so it goes through the analyzer.
            var dishNameField = new Field("DishName", dataRow["DishName"].ToString(), Field.Store.YES, Field.Index.ANALYZED);
            luceneDoc.Add(dishNameField);

            // CustomisationID is an identifier: stored and indexed verbatim.
            var idField = new Field("CustomisationID", dataRow["CustomisationID"].ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED);
            luceneDoc.Add(idField);

            indexWriter.AddDocument(luceneDoc);
        }

        indexWriter.Optimize();
        indexWriter.Flush(true, true, true);
    }
    return ramDirectory;
}
/// <summary>
/// Searches the "DishName" field of the dishes stored in ~/App_data/MyDataset.xml.
/// </summary>
/// <param name="textSearch">Query text in Lucene query-parser syntax.</param>
/// <returns>
/// A table with the same schema as the source table; each returned row has only
/// "CustomisationID" populated, one row per hit. Empty when the query is blank,
/// cannot be parsed, or matches nothing.
/// </returns>
private DataTable SearchDishName(string textSearch)
{
    var ds = new DataSet();
    ds.ReadXml(System.Web.HttpContext.Current.Server.MapPath("~/App_data/MyDataset.xml"));
    DataTable sample = ds.Tables[0];

    // Clone() copies the schema only, so this starts out as an empty result table.
    var table = sample.Clone();

    // A blank query can never match; skip building the index altogether.
    if (string.IsNullOrWhiteSpace(textSearch))
    {
        return table;
    }

    var index = createIndex(sample);
    using (var reader = IndexReader.Open(index, true))
    using (var searcher = new IndexSearcher(reader))
    using (Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30))
    {
        var queryParser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "DishName", analyzer);
        var collector = TopScoreDocCollector.Create(1000, true);
        try
        {
            var query = queryParser.Parse(textSearch);
            searcher.Search(query, collector);
        }
        catch (Lucene.Net.QueryParsers.ParseException)
        {
            // Malformed user input is treated as "no matches"; any other failure
            // now propagates instead of being silently swallowed.
        }

        foreach (var item in collector.TopDocs().ScoreDocs)
        {
            var doc = searcher.Doc(item.Doc);
            var row = table.NewRow();
            row["CustomisationID"] = doc.GetField("CustomisationID").StringValue;
            table.Rows.Add(row);
        }
    }
    return table;
}

StandardAnalyzer does not include stemming. Use SnowballAnalyzer instead. For example, for English text:
// Stemming analyzer for English text; use it in place of StandardAnalyzer
// both when building the index and when parsing queries so the terms agree.
Analyzer analyzer = new SnowballAnalyzer(Lucene.Net.Util.Version.LUCENE_30, "English");

Related

How do I build a function that transforms the columns entered from the DataTable to xml?

Converting a fixed set of columns to XML is not a problem.
The problem is that I want to pass the column indexes in as an array, so that I can choose which columns to include on each call.
Is it possible to put a loop inside the XElement constructor, as in the commented-out lines below?
// Serialises every row of `table` as a <row> element under a root element named
// after the table. ColId is accepted but not used yet: the first three columns
// are hard-coded, and the commented-out loop shows what the author wants instead.
string creatColumnXml(params int[] ColId)
{
    var rowElements = table.Rows.Cast<DataRow>().Select(row =>
        new XElement("row",
            //for ( int i =0;i<ColId.Length;i++)
            //new XElement(table.Columns[ColId[i]].ColumnName, row[ColId[i]]),
            new XElement(table.Columns[0].ColumnName, row[0]),
            new XElement(table.Columns[1].ColumnName, row[1]),
            new XElement(table.Columns[2].ColumnName, row[2])));

    var root = new XElement(table.TableName, rowElements);
    return root.ToString();
}
You can use LINQ again within your existing lambda
// Serialises `table` to XML: one <row> element per DataRow, containing one child
// element per column index listed in ColId (in the order given).
string creatColumnXml(params int[] ColId)
{
    return new XElement(table.TableName,
        table.Rows.Cast<DataRow>().Select(row =>
            new XElement("row",
                // The inner Select produces the child elements for the requested columns.
                ColId.Select(c => new XElement(table.Columns[c].ColumnName, row[c]))
            )
        )
    ).ToString();
}
I found the solution
// Build one <row> element per DataRow, holding only the columns listed in ColId,
// then wrap all rows in a root element named after the table.
XElement[] rowElements = new XElement[table.Rows.Count];
for (int rowIndex = 0; rowIndex < table.Rows.Count; rowIndex++)
{
    DataRow row = table.Rows[rowIndex];
    XElement[] cells = new XElement[ColId.Length];
    for (int i = 0; i < ColId.Length; i++)
    {
        int columnIndex = ColId[i];
        cells[i] = new XElement(table.Columns[columnIndex].ColumnName, row[columnIndex]);
    }
    rowElements[rowIndex] = new XElement("row", cells);
}
string s = new XElement(table.TableName, rowElements).ToString();
Assuming you got your DataRows from a DataSet, you can just directly convert the DataSet to XML using the DataSet.WriteXml method.
Here's an example, hope this is helpful.
// Fill a DataSet from a query, then serialise it into an XmlDocument through an
// in-memory stream.
DataSet MainDS = new DataSet();
// NOTE(review): no SqlConnection is attached to the command in this sample;
// one has to be supplied before Fill can run.
using (SqlDataAdapter da = new SqlDataAdapter())
using (SqlCommand selectCommand = new SqlCommand("select name, city, country from user"))
{
    da.SelectCommand = selectCommand;
    // "row" becomes the table name, and therefore the per-row element name in the XML.
    da.Fill(MainDS, "row");
}
XmlDocument xml = new XmlDocument();
using (MemoryStream ms = new MemoryStream())
{
    try
    {
        XmlWriterSettings settings = new XmlWriterSettings();
        settings.Encoding = Encoding.UTF8;
        settings.CheckCharacters = false;
        // Disposing the writer flushes it; no explicit Close() is needed.
        using (XmlWriter xmlWriter = XmlWriter.Create(ms, settings))
        {
            MainDS.WriteXml(xmlWriter);
        }
        // Rewind so the document is loaded from the start of what was written.
        ms.Position = 0;
        xml.Load(ms);
    }
    catch (Exception err)
    {
        // Keep the original failure as InnerException.
        throw new InvalidOperationException("Error loading dataset to xml", err);
    }
}
From there, use XSL to format.
You can also direct .NET to create attribute based XML.
// Switch every column of every table in `ds` to attribute mapping, so each value
// is written as an XML attribute of its row element.
for (int tableIndex = 0; tableIndex < ds.Tables.Count; tableIndex++)
{
    System.Data.DataTable currentTable = ds.Tables[tableIndex];
    for (int columnIndex = 0; columnIndex < currentTable.Columns.Count; columnIndex++)
    {
        currentTable.Columns[columnIndex].ColumnMapping = MappingType.Attribute;
    }
}

Lucene.Net (4.8) AutoComplete / AutoSuggestion

I'd like to implement a searchable index using Lucene.Net 4.8 that supplies a user with suggestions / autocomplete for single words & phrases.
The index has been created successfully; the suggestions are where I've stalled.
Version 4.8 seems to have introduced a substantial number of breaking changes, and none of the available samples I've found work.
Where I stand
For reference, LuceneVersion is this:
// Lucene compatibility version handed to the analyzers and IndexWriterConfig
// instances in the samples that follow.
private readonly LuceneVersion LuceneVersion = LuceneVersion.LUCENE_48;
Solution 1
I've tried this, but can't get past reader.Terms:
// Attempt 1: index three values in memory, then enumerate the terms of the "text"
// field starting at the prefix "co" and print at most five of them.
// NOTE: TermEnum and reader.Terms(Term) are what fail here - as described below
// the snippet, reader.Terms no longer exists in Lucene.Net 4.8.
public void TryAutoComplete()
{
var analyzer = new EnglishAnalyzer(LuceneVersion);
var config = new IndexWriterConfig(LuceneVersion, analyzer);
RAMDirectory dir = new RAMDirectory();
using (IndexWriter iw = new IndexWriter(dir, config))
{
// A single Document/TextField pair is reused; only the field value changes
// between AddDocument calls.
Document d = new Document();
TextField f = new TextField("text","",Field.Store.YES);
d.Add(f);
f.SetStringValue("abc");
iw.AddDocument(d);
f.SetStringValue("colorado");
iw.AddDocument(d);
f.SetStringValue("coloring book");
iw.AddDocument(d);
iw.Commit();
using (IndexReader reader = iw.GetReader(false))
{
// Position the enumeration at the first term >= "co" in the "text" field.
TermEnum terms = reader.Terms(new Term("text", "co"));
int maxSuggestsCpt = 0;
// will print:
// colorado
// coloring book
do
{
Console.WriteLine(terms.Term.Text);
maxSuggestsCpt++;
// Cap the number of suggestions at five.
if (maxSuggestsCpt >= 5)
break;
}
// Stop once the terms are exhausted or no longer share the prefix.
while (terms.Next() && terms.Term.Text.StartsWith("co"));
}
}
}
reader.Terms no longer exists. Being new to Lucene, it's unclear how to refactor this.
Solution 2
Trying this, I'm thrown an error:
/// <summary>
/// Indexes one sample document, builds a spell-check dictionary from its "text"
/// field, and writes the stored "id" of every document matching a suggestion for
/// the misspelt word "dcument" to the debug output.
/// </summary>
public void TryAutoComplete2()
{
    using (var analyzer = new EnglishAnalyzer(LuceneVersion))
    {
        IndexWriterConfig config = new IndexWriterConfig(LuceneVersion, analyzer);
        RAMDirectory dir = new RAMDirectory();
        using (var iw = new IndexWriter(dir, config))
        {
            Document d = new Document()
            {
                new TextField("text", "this is a document with a some words", Field.Store.YES),
                new Int32Field("id", 42, Field.Store.YES)
            };
            iw.AddDocument(d);
            iw.Commit();
            using (IndexReader reader = iw.GetReader(false))
            using (SpellChecker speller = new SpellChecker(new RAMDirectory()))
            {
                // The original call passed `config` here and failed with
                // "The object cannot be set twice!": an IndexWriterConfig is bound to the
                // first IndexWriter built from it, and IndexDictionary opens a writer of
                // its own. The spell checker therefore needs a fresh config instance.
                var spellerConfig = new IndexWriterConfig(LuceneVersion, analyzer);
                speller.IndexDictionary(new LuceneDictionary(reader, "text"), spellerConfig, false);

                string[] suggestions = speller.SuggestSimilar("dcument", 5);
                IndexSearcher searcher = new IndexSearcher(reader);
                foreach (string suggestion in suggestions)
                {
                    TopDocs docs = searcher.Search(new TermQuery(new Term("text", suggestion)), null, Int32.MaxValue);
                    foreach (var doc in docs.ScoreDocs)
                    {
                        System.Diagnostics.Debug.WriteLine(searcher.Doc(doc.Doc).Get("id"));
                    }
                }
            }
        }
    }
}
When debugging, speller.IndexDictionary(new LuceneDictionary(reader, "text"), config, false); throws a The object cannot be set twice! error, which I can't explain.
Any thoughts are welcome.
Clarification
I'd like to return a list of suggested terms for a given input, not the documents or their full content.
For example, if a document contains "Hello, my name is Clark. I'm from Atlanta," and I submit "Atl," then "Atlanta" should come back as a suggestion.
If I am understanding you correctly you may be over-complicating your index design a bit. If your goal is to use Lucene for auto-complete, you want to create an index of the terms you consider complete. Then simply query the index using a PrefixQuery using a partial word or phrase.
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.En;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using System;
using System.Linq;
namespace LuceneDemoApp
{
    /// <summary>
    /// In-memory auto-complete index: stores complete terms or phrases and returns
    /// those that start with a given prefix.
    /// </summary>
    /// <remarks>
    /// Terms are added as <see cref="StringField"/> values, i.e. each one is indexed
    /// as a single un-analyzed token, so prefix matching is exact and case-sensitive.
    /// </remarks>
    class LuceneAutoCompleteIndex : IDisposable
    {
        const LuceneVersion Version = LuceneVersion.LUCENE_48;
        readonly RAMDirectory Directory;
        readonly Analyzer Analyzer;

        /// <param name="fieldName">Name of the single field the terms are stored under.</param>
        /// <param name="maxResults">Maximum number of suggestions returned per lookup.</param>
        public LuceneAutoCompleteIndex(string fieldName, int maxResults)
        {
            FieldName = fieldName;
            MaxResults = maxResults;
            Directory = new RAMDirectory();
            Analyzer = new EnglishAnalyzer(Version);
        }

        public string FieldName { get; }
        public int MaxResults { get; set; }

        // An IndexWriterConfig can be handed to one IndexWriter only; reusing it makes
        // the second writer fail with "The object cannot be set twice!". Every writer
        // therefore gets its own config.
        private IndexWriter CreateWriter()
        {
            var config = new IndexWriterConfig(Version, Analyzer);
            config.OpenMode = OpenMode.CREATE_OR_APPEND;
            return new IndexWriter(Directory, config);
        }

        private void IndexDoc(IndexWriter writer, string term)
        {
            Document doc = new Document();
            doc.Add(new StringField(FieldName, term, Field.Store.YES));
            writer.AddDocument(doc);
        }

        /// <summary>Adds a single term to the index.</summary>
        public void Add(string term)
        {
            using (var writer = CreateWriter())
            {
                IndexDoc(writer, term);
            }
        }

        /// <summary>Adds several terms to the index using one writer.</summary>
        public void AddRange(string[] terms)
        {
            using (var writer = CreateWriter())
            {
                foreach (string term in terms)
                {
                    IndexDoc(writer, term);
                }
            }
        }

        /// <summary>
        /// Returns up to <see cref="MaxResults"/> stored terms that begin with
        /// <paramref name="term"/>; an empty array when nothing has been indexed yet.
        /// </summary>
        public string[] WhereStartsWith(string term)
        {
            // Opening a reader on a directory without any index would throw.
            if (!DirectoryReader.IndexExists(Directory))
            {
                return new string[0];
            }

            using (var reader = DirectoryReader.Open(Directory))
            {
                IndexSearcher searcher = new IndexSearcher(reader);
                var query = new PrefixQuery(new Term(FieldName, term));
                TopDocs foundDocs = searcher.Search(query, MaxResults);
                return foundDocs.ScoreDocs
                    .Select(scoreDoc => searcher.Doc(scoreDoc.Doc).Get(FieldName))
                    .ToArray();
            }
        }

        public void Dispose()
        {
            Directory.Dispose();
            Analyzer.Dispose();
        }
    }
}
Running this:
// Demo: index six values and print those that start with "app".
var indexValues = new string[] { "apple fruit", "appricot", "ape", "avacado", "banana", "pear" };
// The index owns a directory and an analyzer, so dispose it when done.
using (var index = new LuceneAutoCompleteIndex("fn", 10))
{
    index.AddRange(indexValues);
    var matches = index.WhereStartsWith("app");
    foreach (var match in matches)
    {
        Console.WriteLine(match);
    }
}
You get this:
apple fruit
appricot

Consistent Lucene.NET runtime exception on certain queries

I'm putting together a proof of concept for Fulltext search in our application using Lucene.NET. Some queries work fine, some seem to return results that don't match what the Luke tool is returning. More problematically, this query:
(Description:tasty) (Gtin:00018389732061)
always yields this exception:
An unhandled exception of type 'System.IndexOutOfRangeException'
occurred in Lucene.Net.dll at Lucene.Net.Search.TermScorer.Score()
in d:\Lucene.Net\FullRepo\trunk\src\core\Search\TermScorer.cs:line 136
at
Lucene.Net.Search.BooleanScorer.BooleanScorerCollector.Collect(Int32
doc) in
d:\Lucene.Net\FullRepo\trunk\src\core\Search\BooleanScorer.cs:line 88
at Lucene.Net.Search.TermScorer.Score(Collector c, Int32 end, Int32
firstDocID) in
d:\Lucene.Net\FullRepo\trunk\src\core\Search\TermScorer.cs:line 80
at Lucene.Net.Search.BooleanScorer.Score(Collector collector, Int32
max, Int32 firstDocID) in
d:\Lucene.Net\FullRepo\trunk\src\core\Search\BooleanScorer.cs:line 323
at Lucene.Net.Search.BooleanScorer.Score(Collector collector) in
d:\Lucene.Net\FullRepo\trunk\src\core\Search\BooleanScorer.cs:line 389
at Lucene.Net.Search.IndexSearcher.Search(Weight weight, Filter
filter, Collector collector) in
d:\Lucene.Net\FullRepo\trunk\src\core\Search\IndexSearcher.cs:line 228
at Lucene.Net.Search.IndexSearcher.Search(Weight weight, Filter
filter, Int32 nDocs) in
d:\Lucene.Net\FullRepo\trunk\src\core\Search\IndexSearcher.cs:line 188
at Lucene.Net.Search.Searcher.Search(Query query, Filter filter, Int32
n) in d:\Lucene.Net\FullRepo\trunk\src\core\Search\Searcher.cs:line
108 at Lucene.Net.Search.Searcher.Search(Query query, Int32 n) in
d:\Lucene.Net\FullRepo\trunk\src\core\Search\Searcher.cs:line 118
at...
If I use this query instead:
(Description:tasty) (Gtin:000)
I get results back. What is causing the exception in the top query? FWIW, here is the relevant code snippet:
/// <summary>
/// Runs <paramref name="query"/> against the index stored at
/// <paramref name="indexLocation"/> and returns the matching documents, highest
/// score first, limited to Constants.MaxHits.
/// </summary>
/// <param name="query">Pre-built query to execute.</param>
/// <param name="indexLocation">Directory containing the Lucene index.</param>
/// <param name="defaultField">Unused by this overload; kept so existing callers still compile.</param>
/// <returns>The matching documents; empty when there are no hits.</returns>
protected virtual IList<Document> GetDocuments(BooleanQuery query, DirectoryInfo indexLocation, string defaultField)
{
    var docs = new List<Document>();
    using (var dir = new MMapDirectory(indexLocation))
    using (var searcher = new IndexSearcher(dir))
    {
        // The query arrives already built, so no QueryParser (and no analyzer that
        // would then need disposing) is created here.
        TopDocs result = searcher.Search(query, Constants.MaxHits);
        if (result == null) return docs;
        foreach (var scoredoc in result.ScoreDocs.OrderByDescending(d => d.Score))
        {
            docs.Add(searcher.Doc(scoredoc.Doc));
        }
        return docs;
    }
}
Based on comments below, here is my current un-edited code that still doesn't work.
/// <summary>
/// Re-parses the textual form of <paramref name="query"/> with a StandardAnalyzer,
/// runs it against the index at <paramref name="indexLocation"/>, and returns the
/// matching documents, highest score first, limited to Constants.MaxHits.
/// </summary>
/// <param name="query">Query whose ToString() output is fed back through the parser.</param>
/// <param name="indexLocation">Directory containing the Lucene index.</param>
/// <param name="defaultField">Field used for query terms that carry no field prefix.</param>
protected virtual IList<Document> GetDocuments(BooleanQuery query, DirectoryInfo indexLocation, string defaultField)
{
    var docs = new List<Document>();
    using (var dir = new MMapDirectory(indexLocation))
    using (var searcher = new IndexSearcher(dir))
    using (var analyzer = new StandardAnalyzer(Constants.LuceneVersion))
    {
        var queryParser = new QueryParser(Constants.LuceneVersion, defaultField, analyzer);
        var collector = TopScoreDocCollector.Create(Constants.MaxHits, true);
        // NOTE(review): this round-trips the query through its string form, so every
        // term is analyzed again by StandardAnalyzer - confirm that this matches how
        // each field (e.g. Gtin) was indexed.
        var parsed = queryParser.Parse(query.ToString());
        searcher.Search(parsed, collector);
        var matches = collector.TopDocs().ScoreDocs;
        foreach (var scoredoc in matches.OrderByDescending(d => d.Score))
        {
            docs.Add(searcher.Doc(scoredoc.Doc));
        }
        return docs;
    }
}
Not strictly an answer as it "works on my machine". Posting as an answer so that I can share the unit test code that "works". Hopefully the OP can show what is different with their version.
This version assumes that the "Gtin" field is a string field and is not analyzed (as it seems to be a code).
/// <summary>
/// Reproduction test: indexes one document (analyzed "Description", un-analyzed
/// "Gtin") in memory and checks that several query spellings all find it.
/// </summary>
[TestClass]
public class UnitTest4
{
    [TestMethod]
    public void TestLucene()
    {
        // using blocks release the writer, reader and searcher even if an assert fails.
        using (var writer = CreateIndex())
        {
            Add(writer, "tasty", "00018389732061");
            writer.Flush(true, true, true);
            using (var reader = writer.GetReader())
            using (var searcher = new IndexSearcher(reader))
            {
                Test(searcher, "(Description:tasty) (Gtin:00018389732061)");
                Test(searcher, "Description:tasty Gtin:00018389732061");
                Test(searcher, "+Description:tasty +Gtin:00018389732061");
                Test(searcher, "+Description:tasty +Gtin:000*");
            }
        }
    }

    // Runs the query and asserts that exactly the one indexed Gtin comes back.
    private void Test(IndexSearcher searcher, string query)
    {
        var result = Search(searcher, query);
        Console.WriteLine(string.Join(", ", result));
        Assert.AreEqual(1, result.Count);
        Assert.AreEqual("00018389732061", result[0]);
    }

    // Parses expr (default field "Description") and returns the stored Gtin of each hit.
    private List<string> Search(IndexSearcher searcher, string expr)
    {
        using (var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30))
        {
            var queryParser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "Description", analyzer);
            var collector = TopScoreDocCollector.Create(1000, true);
            var query = queryParser.Parse(expr);
            searcher.Search(query, collector);
            var result = new List<string>();
            foreach (var item in collector.TopDocs().ScoreDocs)
            {
                var doc = searcher.Doc(item.Doc);
                result.Add(doc.GetField("Gtin").StringValue);
            }
            return result;
        }
    }

    // Creates a writer over a fresh in-memory directory; the caller disposes it.
    private IndexWriter CreateIndex()
    {
        var directory = new RAMDirectory();
        var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        var writer = new IndexWriter(directory, analyzer, new IndexWriter.MaxFieldLength(1000));
        return writer;
    }

    // Adds one document: Description is analyzed, Gtin is indexed verbatim.
    private void Add(IndexWriter writer, string desc, string id)
    {
        var document = new Document();
        document.Add(new Field("Description", desc, Field.Store.YES, Field.Index.ANALYZED));
        document.Add(new Field("Gtin", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
        writer.AddDocument(document);
    }
}

Lucene query not returning hit on standard analyzer

I have a file named thatfeelwhen.pdf. When I search for words like "that" or "feel", I don't get a hit, but I do get one if I type "when" or the entire filename. I'm using a standard analyzer. How can I get the Lucene searcher to match any part of the filename? My search queries seem to match the content within the file but not the filename.
public partial class _Default : Page
{
Directory finalDirectory = null;
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
Code below in other methods:
// Adds one PDF to the index: the file name is stored and analyzed, while the
// extracted body text is analyzed for searching but not stored.
private static void AddTextToIndex(string filename, string pdfBody, IndexWriter writer)
{
    var fileNameField = new Field("fileName", filename.ToString(), Field.Store.YES, Field.Index.ANALYZED);
    var bodyField = new Field("pdfBody", pdfBody.ToString(), Field.Store.NO, Field.Index.ANALYZED);

    var pdfDocument = new Document();
    pdfDocument.Add(fileNameField);
    pdfDocument.Add(bodyField);
    writer.AddDocument(pdfDocument);
}
/// <summary>
/// Rebuilds the on-disk index from scratch: extracts the text of every file in the
/// forms folder and indexes it together with the file name.
/// </summary>
/// <param name="analyzer">Analyzer used to tokenise both fields.</param>
/// <returns>The directory containing the freshly built index.</returns>
private static Directory buildIndex(Analyzer analyzer)
{
    string[] syllabusFiles = System.IO.Directory.GetFiles(@"C:\mywebsite\files\forms");
    Directory directory = FSDirectory.Open(new DirectoryInfo(@"C:\mywebsite\files\LuceneIndex"));

    // create: true discards any existing index; using guarantees the writer (and
    // its write lock) is released even if text extraction throws.
    using (var writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
    {
        foreach (string filePath in syllabusFiles)
        {
            string pdfTextExtracted = pdfText(filePath);
            // NOTE(review): this prefix ("C:\website\...") differs from the folder
            // enumerated above ("C:\mywebsite\..."), so as written nothing is stripped
            // and the full path is indexed - confirm which path is intended.
            string fileNameOnly = filePath.Replace("C:\\website\\files\\forms", "");
            AddTextToIndex(fileNameOnly, pdfTextExtracted, writer);
        }
        writer.Optimize();
    }
    return directory;
}
protected void txtBoxSearchPDF_Click(object sender, EventArgs e)
{
if (txtBoxSearchString.Text == "")
{
lblNoSearchString.Visible = true;
}
else if (txtBoxSearchString.Text == "build_index")
{
this.finalDirectory = buildIndex(this.analyzer);
}
else
{
//searching PDF text
lblNoSearchString.Visible = false;
StringBuilder sb = new StringBuilder();
this.finalDirectory = FSDirectory.Open(new DirectoryInfo(#"C:\mywebsite\files\LuceneIndex"));
IndexReader indexReader = IndexReader.Open(this.finalDirectory, true);
Searcher indexSearch = new IndexSearcher(indexReader);
string searchQuery = txtBoxSearchString.Text;
var fields = new[] { "fileName", "pdfBody" };
var queryParser = new MultiFieldQueryParser(Version.LUCENE_30, fields, this.analyzer);
Query query;
try
{
query = queryParser.Parse(searchQuery.Trim());
}
catch (ParseException)
{
query = queryParser.Parse(QueryParser.Escape(searchQuery.Trim()));
}
TopDocs resultDocs = indexSearch.Search(query, indexReader.MaxDoc);
var hits = resultDocs.ScoreDocs;
foreach (var hit in hits)
{
var documentFromSearcher = indexSearch.Doc(hit.Doc);
string getResult = documentFromSearcher.Get("fileName");
string formattedResult = getResult.Replace(" ", "%20");
sb.AppendLine(#"" + getResult+"");
sb.AppendLine("<br>");
}
I chose to use Analyzer analyzer = new SingleCharTokenAnalyzer(); and am getting much better results.
I tried the Simple, Standard, Whitespace, and Keyword analyzers, and none of them really suited my needs without the extra work of customizing them.

Boosting fields or documents has no effect in Lucene.Net

I am trying to get boosting to work, so I can boost docs and/or fields to make the search-result as I like it to be.
However, I am unable to make boosting docs or fields have ANY effect at all on the scoring.
Either Lucene.Net boosting does not work (not very likely) or I am misunderstanding something (very likely).
Here is my stripped down to bare essentials showcase code:
using System;
using System.Collections.Generic;
using Lucene.Net.Analysis;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
namespace SO_LuceneTest
{
    // Minimal console repro: indexes two people, runs the wildcard query "jan*"
    // over both name fields and prints every hit with its score.
    class Program
    {
        const string INDEXNAME = "TextIndex";

        static void Main(string[] args)
        {
            BuildIndex();
            RunSearch();
            Console.WriteLine("...");
            Console.Read();
        }

        // Recreates the index and adds one document per person.
        private static void BuildIndex()
        {
            var indexWriter = new IndexWriter(INDEXNAME, new SimpleAnalyzer(), true);
            indexWriter.DeleteAll();

            // Key = first name, value = last name.
            var persons = new Dictionary<string, string>
            {
                { "Smithers", "Jansen" },
                { "Jan", "Smith" }
            };
            foreach (KeyValuePair<string, string> person in persons)
            {
                var firstnameField = new Field("Firstname", person.Key, Field.Store.YES, Field.Index.ANALYZED);
                var lastnameField = new Field("Lastname", person.Value, Field.Store.YES, Field.Index.ANALYZED);
                //firstnameField.SetBoost(2.0f);

                var personDoc = new Document();
                personDoc.Add(firstnameField);
                personDoc.Add(lastnameField);
                indexWriter.AddDocument(personDoc);
            }

            indexWriter.Commit();
            indexWriter.Close();
        }

        // Parses the wildcard term against both fields and lists the hits.
        private static void RunSearch()
        {
            var boosts = new Dictionary<string, float>();
            //boosts.Add("Firstname", 10);
            QueryParser parser = new MultiFieldQueryParser(
                Lucene.Net.Util.Version.LUCENE_24,
                new string[] { "Firstname", "Lastname" },
                new SimpleAnalyzer(),
                boosts);
            Query parsedQuery = parser.Parse("jan*");

            IndexSearcher indexSearcher = new IndexSearcher(INDEXNAME);
            Hits hits = indexSearcher.Search(parsedQuery);
            int hitCount = hits.Length();
            Console.WriteLine("Found {0} results", hitCount);
            for (int hitIndex = 0; hitIndex < hitCount; hitIndex++)
            {
                Document hitDoc = hits.Doc(hitIndex);
                Console.WriteLine("{0} {1}\t\t{2}", hitDoc.Get("Firstname"), hitDoc.Get("Lastname"), hits.Score(hitIndex));
            }
            indexSearcher.Close();
        }
    }
}
I have commented out two instances of boosting. When included, the score is still the exact same as without the boosting.
What am I missing here?
I am using Lucene.Net v2.9.2.2, the latest version as of now.
Please try this and see whether it works for you. It does for me, but you will have to adapt it, because I have a lot of other code that I won't include in this post unless necessary. The main difference is the use of TopFieldCollector to get the results.
// Search the index, sort by score and then by date, and convert the hits of the
// requested page into Content objects appended to `rv`.
var dir = SimpleFSDirectory.Open(new DirectoryInfo(IndexPath));
var ixSearcher = new IndexSearcher(dir, false);
var qp = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, f_Text, analyzer);
query = CleanQuery(query);
Query q = qp.Parse(query);
TopFieldCollector collector = TopFieldCollector.Create(
    new Sort(new SortField(null, SortField.SCORE, false), new SortField(f_Date, SortField.LONG, true)),
    MAX_RESULTS,
    false, // fillFields - not needed, we want score and doc only
    true, // trackDocScores - need doc and score fields
    true, // trackMaxScore - related to trackDocScores
    false); // should docs be in docId order?
ixSearcher.Search(q, collector);
TopDocs topDocs = collector.TopDocs();
ScoreDoc[] hits = topDocs.ScoreDocs;
uint pageCount = (uint)Math.Ceiling((double)hits.Length / pageSize);
for (uint i = pageIndex * pageSize; i < (pageIndex + 1) * pageSize; i++) {
    // The last page may be only partially filled.
    if (i >= hits.Length) {
        break;
    }
    int doc = hits[i].Doc;
    // Load the stored document once instead of once per field.
    var storedDoc = ixSearcher.Doc(doc);
    Content c = new Content {
        Title = storedDoc.GetField(f_Title).StringValue(),
        Text = FragmentOnOrgText(storedDoc.GetField(f_TextOrg).StringValue(), highligter.GetBestFragments(analyzer, storedDoc.GetField(f_Text).StringValue(), maxNumberOfFragments)),
        Date = DateTools.StringToDate(storedDoc.GetField(f_Date).StringValue()),
        Score = hits[i].Score
    };
    rv.Add(c);
}
ixSearcher.Close();

Categories

Resources