I am attempting to implement the Levenshtein Distance algorithm in C# (for practice and because it'd be handy to have). I used an implementation from the Wikipedia page but for some reason I'm getting the wrong distance on one set of words. Here's the code (from LinqPad):
// LINQPad entry point: computes the distance for one sample word pair and
// shows the result through LINQPad's Dump() extension.
void Main()
{
    var calculator = new LevenshteinDistance();
    calculator.LevenshteinDistanceCalc("sitting", "kitten").Dump();
}
// Define other methods and classes here
// Edit-distance calculator transcribed from the pseudo-code quoted in the
// comments inside LevenshteinDistanceCalc.
// NOTE(review): this is the version that yields 2 for "sitting"/"kitten";
// see the notes on the table size and on the string indexing below.
public class LevenshteinDistance
{
// Distance table; reassigned on every call to LevenshteinDistanceCalc.
private int[,] distance;
// Fills the distance table for source/target and returns its last cell.
public int LevenshteinDistanceCalc(string source, string target)
{
int sourceSize = source.Length, targetSize = target.Length;
// NOTE(review): the table is sized exactly to the string lengths, whereas the
// pseudo-code below indexes d[0..m, 0..n] (one extra row and column).
distance = new int[sourceSize, targetSize];
// First column: row index used as the starting cost.
for (int sIndex = 0; sIndex < sourceSize; sIndex++)
{
distance[sIndex, 0] = sIndex;
}
// First row: column index used as the starting cost.
for (int tIndex = 0; tIndex < targetSize; tIndex++)
{
distance[0,tIndex] = tIndex;
}
// for j from 1 to n:
// for i from 1 to m:
// if s[i] = t[j]:
// substitutionCost:= 0
// else:
// substitutionCost:= 1
// d[i, j] := minimum(d[i - 1, j] + 1, // deletion
// d[i, j - 1] + 1, // insertion
// d[i - 1, j - 1] + substitutionCost) // substitution
//
//
// return d[m, n]
// Both loops start at 1, so source[0] and target[0] are never compared.
for (int tIndex = 1; tIndex < targetSize; tIndex++)
{
for (int sIndex = 1; sIndex < sourceSize; sIndex++)
{
// NOTE(review): the pseudo-code strings are 1-based; here the same indices
// are applied directly to 0-based C# strings.
int substitutionCost = source[sIndex] == target[tIndex] ? 0 : 1;
int deletion = distance[sIndex-1, tIndex]+1;
int insertion = distance[sIndex,tIndex-1]+1;
int substitution = distance[sIndex-1, tIndex-1] + substitutionCost;
distance[sIndex, tIndex] = leastOfThree(deletion, insertion, substitution);
}
}
// Last cell of the table as allocated above.
return distance[sourceSize-1,targetSize-1];
}
// Returns the smallest of the three arguments.
private int leastOfThree(int a, int b, int c)
{
return Math.Min(a,(Math.Min(b,c)));
}
}
When I try "sitting" and "kitten" I get an LD of 2 (should be 3). Yet when I try "Saturday" and "Sunday" I get an LD of 3 (which is correct). I know something's wrong but I can't figure out what I'm missing.
The example on wikipedia uses 1-based strings. In C# we use 0-based strings.
In their matrix the 0-row and 0-column do exist, so the size of their matrix is [source.Length + 1, target.Length + 1]. In your code that extra row and column don't exist.
/// <summary>
/// Computes the Levenshtein distance between <paramref name="source"/> and
/// <paramref name="target"/>, storing the working table in the <c>distance</c> field.
/// </summary>
/// <param name="source">String to transform.</param>
/// <param name="target">String to transform into.</param>
/// <returns>The value in the bottom-right cell of the distance table.</returns>
public int LevenshteinDistanceCalc(string source, string target)
{
    int rows = source.Length;
    int cols = target.Length;

    // One extra row and column stand for the empty prefix of each string.
    distance = new int[rows + 1, cols + 1];

    // Cell [0, 0] keeps its default value of 0; only the rest of the border is written.
    for (int r = 1; r <= rows; r++)
    {
        distance[r, 0] = r;
    }
    for (int c = 1; c <= cols; c++)
    {
        distance[0, c] = c;
    }

    // Table indices are 1-based, string indices 0-based, hence the "- 1" on the strings.
    for (int c = 1; c <= cols; c++)
    {
        char targetChar = target[c - 1];
        for (int r = 1; r <= rows; r++)
        {
            int substitution = distance[r - 1, c - 1] + (source[r - 1] == targetChar ? 0 : 1);
            int deletion = distance[r - 1, c] + 1;
            int insertion = distance[r, c - 1] + 1;
            distance[r, c] = leastOfThree(deletion, insertion, substitution);
        }
    }

    return distance[rows, cols];
}
Your matrix isn't big enough.
In the pseudo-code, s and t have lengths m and n respectively (char s[1..m], char t[1..n]). The matrix however has dimensions [0..m, 0..n] - i.e. one more than the length of the strings in each direction. You can see this in the tables below the pseudo-code.
So the matrix for "sitting" and "kitten" is 7x8, but your matrix is only 6x7.
You're also indexing into the strings incorrectly, because the strings in the pseudo-code are 1-indexed, but C#'s strings are 0-indexed.
After fixing these, you get this code, which works with "sitting" and "kitten":
/// <summary>
/// Levenshtein edit distance using a full (source + 1) x (target + 1) table.
/// </summary>
public static class LevenshteinDistance
{
    /// <summary>
    /// Returns the number of single-character insertions, deletions and
    /// substitutions needed to turn <paramref name="source"/> into
    /// <paramref name="target"/>.
    /// </summary>
    /// <param name="source">String to transform.</param>
    /// <param name="target">String to transform into.</param>
    /// <returns>The edit distance between the two strings.</returns>
    public static int LevenshteinDistanceCalc(string source, string target)
    {
        int rowCount = source.Length + 1;
        int columnCount = target.Length + 1;
        var table = new int[rowCount, columnCount];

        // Border cells: distance between a prefix and the empty string.
        for (int row = 0; row < rowCount; row++)
        {
            table[row, 0] = row;
        }
        for (int column = 0; column < columnCount; column++)
        {
            table[0, column] = column;
        }

        // Table index k corresponds to string index k - 1.
        for (int column = 1; column < columnCount; column++)
        {
            for (int row = 1; row < rowCount; row++)
            {
                bool sameChar = source[row - 1] == target[column - 1];

                int best = table[row - 1, column - 1] + (sameChar ? 0 : 1); // substitution
                best = Math.Min(best, table[row - 1, column] + 1);          // deletion
                best = Math.Min(best, table[row, column - 1] + 1);          // insertion

                table[row, column] = best;
            }
        }

        return table[rowCount - 1, columnCount - 1];
    }
}
(I also took the liberty of making distance a local variable since there's no need for it to be a field (it only makes your class non-threadsafe), and also making it static to avoid the unnecessary instantiation).
To debug this, I put a breakpoint on return distance[sourceSize - 1, targetSize - 1] and compared distance to the table on Wikipedia. It was very obvious that it was too small.
Related
Doing a Binary Insertion Sort and my recursion is backfiring on me when I return my array. When I use the array : { 3, 1, 2, 4 } I get back 1,2,3,4.
When I use the array : { 3, 7, 2, 4 } the recursion causes a StackOverflow.
Where am I going wrong?
p.s Sorry about the console.writelines in my code it helps me check whats going on as I develop
The C# code is per the following snippet:
// Driver for BinaryInsertSort: all three pointers start at index 0 and the
// first element to place is index 1.
int[] a = new int[] { 3, 1, 2, 4 }; //Array to be sorted
int MiddlePointer = 0;
int LeftPointer = 0;
int RightPointer = 0;
int i = 1; //First number is 'sorted' so focus on second number
BinaryInsertSort(a, MiddlePointer, LeftPointer, RightPointer, i);
// Recursive driver: handles element a[i], prints the array, waits for a key
// press, then calls itself for i + 1 until i reaches a.Length.
// The pointer arguments are passed by value, so changes made inside Testing
// are not visible here; only the values assigned in this method are forwarded
// to the next call.
void BinaryInsertSort(int[] a, int MiddlePointer, int LeftPointer, int RightPointer, int i)
{
if (i == a.Length) //This should EXIT the algorithm once all of the numbers are sorted
{
return;
}
// '&' is the non-short-circuiting AND; all three comparisons are always evaluated.
if (MiddlePointer == 0 & LeftPointer == 0 & RightPointer == 0) //If this is the first iteration, only the first number is 'sorted' and all of the pointers are the same
{
if (a[i] > a[MiddlePointer]) //If the next number is higher then just raise the RightPointer
{
RightPointer = i;
}
else //If the next number is lower, the 'sorted' values need to be 'shifted' one place to the right
{
RightPointer = i;
int temp = a[i];
// Shift a[LeftPointer..i-1] one slot to the right, then drop temp at LeftPointer.
for (int j = RightPointer; j > LeftPointer; j--)
{
a[j] = a[j - 1];
}
a[LeftPointer] = temp;
}
// i++; //At this point one number has been sorted
}
else
{
// Testing returns the same array instance it was given.
a = Testing(a, MiddlePointer, LeftPointer, RightPointer, i);
}
// Debug output of the whole array after this step.
foreach (int x in a)
{
Console.WriteLine(x);
}
// Blocks until a line is read from the console.
Console.ReadLine();
i++;
// NOTE(review): MiddlePointer and LeftPointer are forwarded unchanged from
// this call's arguments — confirm that is intended.
BinaryInsertSort(a, MiddlePointer, LeftPointer, RightPointer, i);
}
// Recursively narrows Left/Middle/Right towards the slot for a[i]; when all
// three pointers are equal it either leaves a[i] in place or shifts elements
// right and writes a[i] at LeftPointer. Always returns the array it was given.
// NOTE(review): there is no branch for a[i] == a[MiddlePointer] while the
// pointers differ; in that case nothing is moved and the array is returned as is.
int[] Testing(int[] a, int MiddlePointer, int LeftPointer, int RightPointer,int i) //This method should find the space where the number should be inserted and return the updated array
{
// '&' is the non-short-circuiting AND; both comparisons are always evaluated.
if(MiddlePointer == RightPointer & RightPointer == LeftPointer)
{
Console.WriteLine($"{a[i]} has not been found");
if (a[i] > a[MiddlePointer])
{
// Only the local copy of RightPointer changes; the array is untouched.
RightPointer = i;
}
else
{
RightPointer = i;
int temp = a[i];
// Shifts every element from index 0 up to i - 1 one slot to the right
// (the loop bound is 0, not LeftPointer).
for (int j = RightPointer; j > 0; j--)//move up values
{
a[j] = a[j - 1];
}
a[LeftPointer] = temp;
}
}
else if (a[i] > a[MiddlePointer])
{
Console.WriteLine($"{a[i]} is greater than {a[MiddlePointer]}");
LeftPointer = MiddlePointer + 1;
MiddlePointer = (LeftPointer + RightPointer) / 2;
// The recursive call's return value is ignored; it is the same array instance.
Testing(a, MiddlePointer, LeftPointer, RightPointer, i);
}
else if (a[i] < a[MiddlePointer])
{
Console.WriteLine($"{a[i]} is less than {a[MiddlePointer]}");
// NOTE(review): RightPointer is not clamped, so it can drop below LeftPointer
// (e.g. to -1 when MiddlePointer is 0). The three-way equality test above may
// then never become true, which looks like the source of the unbounded
// recursion described in the question — confirm.
RightPointer = MiddlePointer - 1;
MiddlePointer = (LeftPointer + RightPointer) / 2;
Testing(a, MiddlePointer, LeftPointer, RightPointer, i);
}
return a;
}
Basically I went back to scratch and did a binary search first. This really helped in my previous attempt so I really made sure this worked before moving onto binary insertion sort. From this point I modified the code bit by bit and tried loads of different data sets to see if anything changed. Using breakpoints really helped. I needed to go through and 'follow' the changing variables to see where errors were being made. I also used Console.ReadLine and Console.WriteLine to follow where my logic was ending up. There are many loops in this code and the biggest problem was when I was using the recursion. I was using the recursion to find the 'gap' where the number should be inserted; this was the moment all of the pointers were the same. These pointers needed to constantly be updated and changed at the right moments. This was the most challenging part of the sort.
// Driver for SortList: start with element index 1 and all pointers at index 0.
int i = 1;
int MiddlePointer = 0;
int LeftPointer = 0;
int RightPointer = 0;
// NOTE(review): SortList is declared as an instance method below, so
// BinaryInsertionSort is presumably an object reference here — confirm.
BinaryInsertionSort.SortList(UnsortedNumberList, MiddlePointer, LeftPointer, RightPointer, i);
// Recursive binary insertion sort over 'a', in place.
// Each call does one of three things:
//   * i == a.Length: print the array and stop;
//   * a[i] can be placed (it equals a[MiddlePointer], or all three pointers
//     coincide): shift elements right, insert a[i], then recurse for i + 1
//     with LeftPointer = 0, MiddlePointer = i / 2 (old i) and RightPointer = old i;
//   * otherwise: move LeftPointer or RightPointer past MiddlePointer, recompute
//     MiddlePointer and recurse for the same i.
public void SortList(int[] a, int MiddlePointer, int LeftPointer, int RightPointer, int i)
{
if (i == a.Length)
{
Console.Write("Sorted list: ");
for (int x = 0; x < a.Length; x++) //output sorted list
{
// The last element is written without a trailing ", ".
if (x == a.Length - 1)
{
Console.Write($"{a[x]}");
}
else
{
Console.Write($"{a[x]}, ");
}
}
return;
}
// Equal to the probed element: insert a[i] directly after MiddlePointer.
if (a[MiddlePointer] == a[i])
{
RightPointer = i;
int temp = a[i];
// Shift a[MiddlePointer+1 .. i-1] one slot to the right.
for (int j = i; j > MiddlePointer + 1; j--)
{
a[j] = a[j - 1];
}
a[MiddlePointer + 1] = temp;
// Reset the search window for the next element (i is still the old value here).
LeftPointer = 0;
MiddlePointer = i / 2;
i++;
SortList(a, MiddlePointer, LeftPointer, RightPointer, i);
}
// '&' is the non-short-circuiting AND; both comparisons are always evaluated.
else if (MiddlePointer == RightPointer & RightPointer == LeftPointer)
{
// Search window has collapsed to one index: insert after it or at it.
if (a[i] > a[MiddlePointer])
{
RightPointer = i;
int temp = a[i];
for (int j = i; j > MiddlePointer + 1; j--)
{
a[j] = a[j - 1];
}
a[MiddlePointer + 1] = temp;
}
else //If the next number is lower, the 'sorted' values need to be 'shifted' one place to the right
{
RightPointer = i;
int temp = a[i];
for (int j = i; j > MiddlePointer; j--)
{
a[j] = a[j - 1];
}
a[MiddlePointer] = temp;
}
// Reset the search window for the next element (i is still the old value here).
LeftPointer = 0;
MiddlePointer = i / 2;
i++;
SortList(a, MiddlePointer, LeftPointer, RightPointer, i);
}
else if (a[i] > a[MiddlePointer])
{
// Continue in the upper half; LeftPointer is clamped so it never passes RightPointer.
LeftPointer = MiddlePointer + 1;
if (LeftPointer > RightPointer)
{
LeftPointer = RightPointer;
}
MiddlePointer = (LeftPointer + RightPointer) / 2;
SortList(a, MiddlePointer, LeftPointer, RightPointer, i);
}
else if (a[i] < a[MiddlePointer])
{
// Continue in the lower half; RightPointer is clamped at 0.
RightPointer = MiddlePointer - 1;
if (RightPointer < 0)
{
RightPointer = 0;
}
MiddlePointer = (LeftPointer + RightPointer) / 2;
SortList(a, MiddlePointer, LeftPointer, RightPointer, i);
}
}
}
UnsortedNumberList is an array of numbers. At first the pointers point at the first number, at array position 0. When the pointers are equal to each other, the position where the number should be inserted has been found. From this point, evaluate whether the next number is higher or lower than the value at this position. Numbers will have to be shifted along, hence the for loops with int j and the variable temp. I hope this helps anyone else doing binary insertion sort in the future.
I want to allow users to input their own characters to find the longest common subsequence. This code is mostly a skeleton of somebody else's work; the strings it used were AGGTAB and GXTXYAB, whose longest common subsequence is GTAB. However, I want users to be able to enter their own strings.
code for longest common subsequence:
/// <summary>
/// Console program that reads two strings from the user and reports the
/// length of their longest common subsequence (LCS).
/// </summary>
class GFG
{
    /// <summary>
    /// Prompts for strings A and B, fills the LCS length table and prints the result.
    /// </summary>
    public static void Main()
    {
        // Read the input first: the table dimensions depend on it.
        Console.WriteLine("Write down characters for string A");
        string A = Console.ReadLine() ?? string.Empty; // ReadLine returns null at end of input
        Console.WriteLine("Write down characters for string B");
        string B = Console.ReadLine() ?? string.Empty;

        int m = A.Length;
        int n = B.Length;

        // L[i, j] = LCS length of the first i characters of A and the first j of B.
        // Row 0 and column 0 (an empty prefix) keep their default value of 0.
        int[,] L = new int[m + 1, n + 1];
        for (int i = 1; i <= m; i++)
        {
            for (int j = 1; j <= n; j++)
            {
                if (A[i - 1] == B[j - 1])
                {
                    // Matching characters extend the LCS of both shorter prefixes.
                    L[i, j] = L[i - 1, j - 1] + 1;
                }
                else
                {
                    // Otherwise drop one character from either string and keep the better result.
                    L[i, j] = Math.Max(L[i - 1, j], L[i, j - 1]);
                }
            }
        }

        Console.WriteLine("Length of LCS is " + L[m, n]);
    }
}
I am trying to implement a step counter into my bubble sort algorithm, but I don't know how to display the counter at the end of the sorting algorithm. If anyone could explain how I should go about this that would be great. Thank You.
My current code: (Bubble Sort):
/// <summary>
/// Bubble-sorts the first <paramref name="n"/> elements of <paramref name="arr"/>
/// in place, in ascending order.
/// </summary>
/// <param name="arr">Array to sort; it is modified.</param>
/// <param name="n">Number of leading elements to sort.</param>
/// <returns>The same array instance that was passed in.</returns>
static int[] bubbleSort(int[] arr, int n)
{
    int stepCount = 0; // comparisons made so far; counted but not yet handed back to the caller

    // After each pass the largest remaining value sits at index unsortedEnd.
    for (int unsortedEnd = n - 1; unsortedEnd > 0; unsortedEnd--)
    {
        for (int left = 0; left < unsortedEnd; left++)
        {
            int right = left + 1;
            if (arr[right] < arr[left])
            {
                int swapped = arr[left];
                arr[left] = arr[right];
                arr[right] = swapped;
            }

            stepCount++;
        }
    }

    return arr;
}
/// <summary>
/// Writes every element of <paramref name="arr"/> to the console on one line,
/// each followed by a single space.
/// </summary>
/// <param name="arr">Values to print.</param>
public static void DisplayArrayBubble(int[] arr)
{
    for (int position = 0; position < arr.Length; position++)
    {
        Console.Write(arr[position].ToString() + " ");
    }
}
Why not just return int - number of steps? I.e.
/// <summary>
/// Bubble-sorts <paramref name="arr"/> in place, in ascending order.
/// </summary>
/// <param name="arr">Array to sort; may be null, in which case nothing happens.</param>
/// <returns>The number of swaps performed (0 for a null array).</returns>
static int bubbleSort(int[] arr)
{
    if (arr == null)
    {
        return 0;
    }

    int swaps = 0;

    // After each pass the largest remaining value sits at index unsortedEnd.
    for (int unsortedEnd = arr.Length - 1; unsortedEnd > 0; unsortedEnd--)
    {
        for (int left = 0; left < unsortedEnd; left++)
        {
            int right = left + 1;
            if (arr[right] < arr[left])
            {
                int held = arr[left];
                arr[left] = arr[right];
                arr[right] = held;
                swaps++;
            }
        }
    }

    return swaps;
}
Demo:
// Demo: sort a sample array in place and report how many steps bubbleSort returned.
int[] sample = { 1, 5, 4, 3, 2, 7 };
int steps = bubbleSort(sample);
string sortedText = string.Join(", ", sample);
Console.WriteLine($"Sorted [{sortedText}] in {steps} steps");
Outcome:
Sorted [1, 2, 3, 4, 5, 7] in 6 steps
There a plethora of ways but one is to make a custom class to hold both pieces of information that you need:
/// <summary>
/// Simple holder that pairs an array with a step count so both can be
/// returned from a single method call.
/// </summary>
public class BubbleObject
{
    /// <summary>The array carried by this result.</summary>
    public int[] arr { get; set; }

    /// <summary>The step count carried by this result.</summary>
    public int stepCount { get; set; }
}
Then adjust the code you have to use that object:
/// <summary>
/// Bubble-sorts the first <paramref name="n"/> elements of <paramref name="arr"/>
/// in place, in ascending order, and counts the comparisons made.
/// </summary>
/// <param name="arr">Array to sort; it is modified.</param>
/// <param name="n">Number of leading elements to sort.</param>
/// <returns>
/// A <c>BubbleObject</c> holding the same array instance and the number of
/// comparisons performed.
/// </returns>
static BubbleObject bubbleSort(int[] arr, int n)
{
    int stepCount = 0;

    for (int i = 0; i < n - 1; i++)
    {
        // The last i elements are already in their final position.
        for (int j = 0; j < n - 1 - i; j++)
        {
            if (arr[j + 1] < arr[j])
            {
                int temp = arr[j];
                arr[j] = arr[j + 1];
                arr[j + 1] = temp;
            }

            // Counted once per comparison, whether or not a swap happened.
            stepCount++;
        }
    }

    // The original statement was missing its terminating semicolon.
    return new BubbleObject { arr = arr, stepCount = stepCount };
}
/// <summary>
/// Prints the step count on its own line, then every element of the array on
/// one line, each followed by a single space.
/// </summary>
/// <param name="bo">Result object holding the array and its step count.</param>
public static void DisplayArrayBubble(BubbleObject bo)
{
    Console.WriteLine("Number of Steps = " + bo.stepCount);

    int[] values = bo.arr;
    for (int position = 0; position < values.Length; position++)
    {
        Console.Write(values[position].ToString() + " ");
    }
}
That should do it. There are other ways as well.
Hi i'm using the levenshtein algorithm to calculate the difference between two strings, using the below code. It currently provides the total number of changes which need to be made to get from 'answer' to 'target', but i'd like to split these up into the types of errors being made. So classifying an error as a deletion, substitution or insertion.
I've tried adding a simple count but i'm new at this and don't really understand how the code works so not sure how to go about it.
/// <summary>
/// Levenshtein edit distance between two strings.
/// </summary>
static class LevenshteinDistance
{
    /// <summary>
    /// Computes the number of single-character insertions, deletions and
    /// substitutions needed to turn <paramref name="s"/> into <paramref name="t"/>.
    /// </summary>
    /// <param name="s">First string.</param>
    /// <param name="t">Second string.</param>
    /// <returns>The edit distance; the other string's length if one is empty.</returns>
    public static int Compute(string s, string t)
    {
        int rows = s.Length;
        int cols = t.Length;

        // With one side empty, the distance is simply the other side's length.
        if (rows == 0)
        {
            return cols;
        }
        if (cols == 0)
        {
            return rows;
        }

        // table[r, c] = distance between the first r chars of s and the first c chars of t.
        int[,] table = new int[rows + 1, cols + 1];

        for (int r = 0; r <= rows; r++)
        {
            table[r, 0] = r;
        }
        for (int c = 0; c <= cols; c++)
        {
            table[0, c] = c;
        }

        for (int r = 1; r <= rows; r++)
        {
            char fromS = s[r - 1];
            for (int c = 1; c <= cols; c++)
            {
                int substitutionCost = t[c - 1] == fromS ? 0 : 1;

                int viaDeletion = table[r - 1, c] + 1;
                int viaInsertion = table[r, c - 1] + 1;
                int viaSubstitution = table[r - 1, c - 1] + substitutionCost;

                table[r, c] = Math.Min(Math.Min(viaDeletion, viaInsertion), viaSubstitution);
            }
        }

        return table[rows, cols];
    }
}
Thanks in advance.
I have the following implementation, but I want to add a threshold, so if the result is going to be greater than it, just stop calculating and return.
How would I go about that?
EDIT: Here is my current code, threshold is not yet used...the goal is that it is used
/// <summary>
/// Computes the Damerau-Levenshtein distance (insertions, deletions,
/// substitutions and adjacent transpositions) between two strings.
/// </summary>
/// <param name="string1">First string; may be null or empty.</param>
/// <param name="string2">Second string; may be null or empty.</param>
/// <param name="threshold">Accepted for API compatibility; not used by this version.</param>
/// <returns>The edit distance between the two strings.</returns>
public static int DamerauLevenshteinDistance(string string1, string string2, int threshold)
{
    // Trivial case: equal strings. The static Equals is null-safe, so a null
    // string1 reaches the null/empty handling below instead of throwing.
    if (string.Equals(string1, string2))
    {
        return 0;
    }

    // Trivial case: one side is null or empty, so the distance is the other side's length.
    if (string.IsNullOrEmpty(string1) || string.IsNullOrEmpty(string2))
    {
        return (string1 ?? "").Length + (string2 ?? "").Length;
    }

    // Ensure string2 (inner loop) is the longer one.
    if (string1.Length > string2.Length)
    {
        var tmp = string1;
        string1 = string2;
        string2 = tmp;
    }

    // Trivial case: string1 occurs inside string2, so only the surplus must be inserted.
    if (string2.Contains(string1))
    {
        return string2.Length - string1.Length;
    }

    var length1 = string1.Length;
    var length2 = string2.Length;

    // d[i, j] = distance between the first i chars of string1 and the first j of string2.
    var d = new int[length1 + 1, length2 + 1];
    for (var i = 0; i <= length1; i++)
    {
        d[i, 0] = i;
    }
    for (var j = 0; j <= length2; j++)
    {
        d[0, j] = j;
    }

    for (var i = 1; i <= length1; i++)
    {
        for (var j = 1; j <= length2; j++)
        {
            var cost = string1[i - 1] == string2[j - 1] ? 0 : 1;
            var del = d[i - 1, j] + 1;
            var ins = d[i, j - 1] + 1;
            var sub = d[i - 1, j - 1] + cost;
            d[i, j] = Math.Min(del, Math.Min(ins, sub));

            // Adjacent transposition: the last two characters are swapped.
            if (i > 1 && j > 1 && string1[i - 1] == string2[j - 2] && string1[i - 2] == string2[j - 1])
            {
                d[i, j] = Math.Min(d[i, j], d[i - 2, j - 2] + cost);
            }
        }
    }

    return d[length1, length2];
}
}
This is regarding your answer to this question: Damerau - Levenshtein Distance, adding a threshold
(sorry can't comment as I don't have 50 rep yet)
I think you have made an error here. You initialized:
var minDistance = threshold;
And your update rule is:
if (d[i, j] < minDistance)
minDistance = d[i, j];
Also, your early exit criterion is:
if (minDistance > threshold)
return int.MaxValue;
Now, observe that the if condition above will never hold true! You should rather initialize minDistance to int.MaxValue
Here's the most elegant way I can think of. After setting each index of d, see if it exceeds your threshold. The evaluation is constant-time, so it's a drop in the bucket compared to the theoretical N^2 complexity of the overall algorithm:
// Sketch of a threshold check added to the distance loop; the "..." lines
// stand for code elided by the author.
// NOTE(review): this returns as soon as ANY single cell exceeds the threshold,
// and it returns that cell's value rather than the final distance. Cells far
// from the diagonal (e.g. d[1, j] for large j) can exceed the threshold even
// when the final cell does not — confirm before relying on this early exit.
public static int DamerauLevenshteinDistance(string string1, string string2, int threshold)
{
...
for (var i = 1; i <= d.GetUpperBound(0); i++)
{
for (var j = 1; j <= d.GetUpperBound(1); j++)
{
...
// temp mirrors the value just stored in d[i,j].
var temp = d[i,j] = Math.Min(del, Math.Min(ins, sub));
// Adjacent transposition: the last two characters are swapped.
if (i > 1 && j > 1 && string1[i - 1] == string2[j - 2] && string1[i - 2] == string2[j - 1])
temp = d[i,j] = Math.Min(temp, d[i - 2, j - 2] + cost);
//Does this value exceed your threshold? if so, get out now
if(temp > threshold)
return temp;
}
}
return d[d.GetUpperBound(0), d.GetUpperBound(1)];
}
You also asked this as a SQL CLR UDF question, so I'll answer in that specific context: your best optimization won't come from optimizing the Levenshtein distance, but from reducing the number of pairs you compare. Yes, a faster Levenshtein algorithm will improve things, but not nearly as much as reducing the number of comparisons from N squared (with N in the millions of rows) to N times some factor. My proposal is to compare only elements whose length difference is within a tolerable delta. On your big table, you add a persisted computed column on LEN(Data) and then create an index on it that includes Data:
-- Add a persisted computed column holding the length of Data.
-- NOTE(review): "Table" looks like a placeholder for the real table name — confirm.
ALTER TABLE Table ADD LenData AS LEN(Data) PERSISTED;
-- Index the length column, with Data as an included column.
CREATE INDEX ndxTableLenData on Table(LenData) INCLUDE (Data);
Now you can restrict the sheer problem space by joining within a maximum difference in length (e.g. 5), if your data's LEN(Data) varies significantly:
-- Compare only pairs whose lengths differ by at most 5 characters.
-- The join uses LenData, the computed column name created by the ALTER TABLE above.
SELECT a.Data, b.Data, dbo.Levenshtein(a.Data, b.Data)
FROM Table A
JOIN Table B ON B.LenData BETWEEN A.LenData - 5 AND A.LenData + 5
Finally got it...though it's not as beneficial as I had hoped
/// <summary>
/// Computes the Damerau-Levenshtein distance (insertions, deletions,
/// substitutions and adjacent transpositions) between two strings, giving up
/// early once the distance is known to exceed <paramref name="threshold"/>.
/// </summary>
/// <param name="string1">First string; may be null or empty.</param>
/// <param name="string2">Second string; may be null or empty.</param>
/// <param name="threshold">Largest distance the caller is interested in.</param>
/// <returns>
/// The distance when the table-based computation yields a value no greater
/// than <paramref name="threshold"/>; otherwise <see cref="int.MaxValue"/>.
/// The trivial shortcuts (equal, null/empty, one string contained in the
/// other) return the exact distance without consulting the threshold.
/// </returns>
public static int DamerauLevenshteinDistance(string string1, string string2, int threshold)
{
    // Trivial case: equal strings. The static Equals is null-safe, so a null
    // string1 reaches the null/empty handling below instead of throwing.
    if (string.Equals(string1, string2))
    {
        return 0;
    }

    // Trivial case: one side is null or empty, so the distance is the other side's length.
    if (string.IsNullOrEmpty(string1) || string.IsNullOrEmpty(string2))
    {
        return (string1 ?? "").Length + (string2 ?? "").Length;
    }

    // Ensure string2 (inner loop) is the longer one.
    if (string1.Length > string2.Length)
    {
        var tmp = string1;
        string1 = string2;
        string2 = tmp;
    }

    // Trivial case: string1 occurs inside string2, so only the surplus must be inserted.
    if (string2.Contains(string1))
    {
        return string2.Length - string1.Length;
    }

    var length1 = string1.Length;
    var length2 = string2.Length;

    // d[i, j] = distance between the first i chars of string1 and the first j of string2.
    var d = new int[length1 + 1, length2 + 1];
    for (var i = 0; i <= length1; i++)
    {
        d[i, 0] = i;
    }
    for (var j = 0; j <= length2; j++)
    {
        d[0, j] = j;
    }

    for (var i = 1; i <= length1; i++)
    {
        var im1 = i - 1;
        var im2 = i - 2;

        // Smallest value in this row. It must start at int.MaxValue: starting at
        // the threshold would make the "row minimum > threshold" test below
        // impossible to satisfy.
        var minDistance = int.MaxValue;

        for (var j = 1; j <= length2; j++)
        {
            var jm1 = j - 1;
            var jm2 = j - 2;
            var cost = string1[im1] == string2[jm1] ? 0 : 1;
            var del = d[im1, j] + 1;
            var ins = d[i, jm1] + 1;
            var sub = d[im1, jm1] + cost;

            // Inline three-way minimum (kept from the original as a micro-optimisation).
            var best = del <= ins && del <= sub ? del : ins <= sub ? ins : sub;

            // Adjacent transposition: the last two characters are swapped.
            if (i > 1 && j > 1 && string1[im1] == string2[jm2] && string1[im2] == string2[jm1])
            {
                var transposed = d[im2, jm2] + cost;
                if (transposed < best)
                {
                    best = transposed;
                }
            }

            d[i, j] = best;
            if (best < minDistance)
            {
                minDistance = best;
            }
        }

        // Every cell in this row is already above the threshold, so the final
        // distance cannot come back under it.
        if (minDistance > threshold)
        {
            return int.MaxValue;
        }
    }

    var result = d[length1, length2];
    return result > threshold ? int.MaxValue : result;
}