C# HtmlAgilityPack : startIndex cannot be larger than length of string - c#

I'm trying to do something like this :
// Load the search page and select every <div> whose class attribute contains "listing-content".
var document = htmlWeb.Load(searchUrl);
var hotels = document.DocumentNode.Descendants("div")
.Where(x => x.Attributes.Contains("class") &&
x.Attributes["class"].Value.Contains("listing-content"));
// count is initialised here but never read in this snippet.
int count = 1;
foreach (var hotel in hotels)
{
// A separate HtmlDocument is created for each listing.
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
// NOTE(review): Load is passed the node's InnerText here - confirm that LoadHtml with InnerHtml was not intended.
htmlDoc.Load(hotel.InnerText);
if (htmlDoc.DocumentNode != null)
{
// Select the <div> nodes whose class contains "srp-business-name" and print their inner markup.
var anchors = htmlDoc.DocumentNode.Descendants("div")
.Where(x => x.Attributes.Contains("class") &&
x.Attributes["class"].Value.Contains("srp-business-name")); // Error Occurring in here //
foreach (var anchor in anchors)
{
Console.WriteLine(anchor.InnerHtml);
}
}
}
I'm getting results like this :
New York Marriott Marquis
<span class="external-link">
<img height="15" src="/images/sprites/search/icon-link-external.png" width="16">
</span>
And
Courtyard by Marriott New York Manhattan/Times Square South
And so on.
Now I want the InnerHtml of the anchor tags that have class="url redbold mip-link". So I'm doing this:
// Load the search page and select every <div> whose class attribute contains "listing-content".
var document = htmlWeb.Load(searchUrl);
var hotels = document.DocumentNode.Descendants("div")
.Where(x => x.Attributes.Contains("class") &&
x.Attributes["class"].Value.Contains("listing-content"));
// count is initialised here but never read in this snippet.
int count = 1;
foreach (var hotel in hotels)
{
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
// NOTE(review): Load is passed InnerText here, while the inner loop uses LoadHtml with InnerHtml - confirm which is intended.
htmlDoc.Load(hotel.InnerText);
if (htmlDoc.DocumentNode != null)
{
// NOTE(review): anchors is a LINQ Where query over htmlDoc's nodes and is never materialised (no ToList),
// so it is presumably still being enumerated when htmlDoc is reloaded in the loop below - confirm.
var anchors = htmlDoc.DocumentNode.Descendants("div")
.Where(x => x.Attributes.Contains("class") &&
x.Attributes["class"].Value.Contains("srp-business-name"));
foreach (var anchor in anchors)
{
// Reloads the same htmlDoc instance that the anchors query above was built from.
htmlDoc.LoadHtml(anchor.InnerHtml);
// NOTE(review): hoteltags is iterated without a null check - confirm SelectNodes cannot return null here.
var hoteltags = htmlDoc.DocumentNode.SelectNodes("//a");
foreach (var tag in hoteltags)
{
// The two checks are OR-ed, so the condition holds for any non-null, non-empty InnerHtml (whitespace-only included).
if (!string.IsNullOrEmpty(tag.InnerHtml) || !string.IsNullOrWhiteSpace(tag.InnerHtml))
{
Console.WriteLine(tag.InnerHtml);
}
}
}
}
}
I'm getting the first result properly, which is New York Marriott Marquis, but on the second result an error occurs:
startIndex cannot be larger than length of string. What am I doing wrong?

You are using the same DOM object for all your operations:
foreach (var hotel in hotels)
{
HtmlDocument htmlDoc = new HtmlDocument();
And after that you are using the same object for loading anchor tags:
foreach (var anchor in anchors)
{
htmlDoc.LoadHtml(anchor.InnerHtml);
Just create a new HtmlDocument inside the second loop instead of reusing htmlDoc, and it should work as expected.
foreach (var anchor in anchors)
{
var htmlDocAnchor= new HtmlDocument();
htmlDocAnchor.LoadHtml(anchor.InnerHtml);// And etc..

Related

How I remove unwanted pre and code tags of html content with HtmlAgilityPack [duplicate]

I'm using the following method to extract text from HTML:
/// <summary>
/// Parses <paramref name="_html"/> and returns the trimmed inner text of every
/// childless node, one entry per line, HTML-decoded.
/// </summary>
/// <param name="_html">HTML markup to extract text from.</param>
/// <returns>
/// The collected, HTML-decoded text. If parsing or traversal throws, the
/// exception is swallowed and an empty string is returned.
/// </returns>
public string getAllText(string _html)
{
    string collected = "";
    try
    {
        var parsed = new HtmlAgilityPack.HtmlDocument();
        parsed.LoadHtml(_html);
        var lines = new StringBuilder();
        foreach (var current in parsed.DocumentNode.DescendantNodesAndSelf())
        {
            // Only nodes without children contribute text.
            if (current.HasChildNodes)
            {
                continue;
            }

            string leafText = current.InnerText;
            if (string.IsNullOrEmpty(leafText))
            {
                continue;
            }

            lines.AppendLine(leafText.Trim());
        }
        collected = lines.ToString();
    }
    catch (Exception)
    {
        // Best effort: on any failure the result stays empty.
    }
    return System.Web.HttpUtility.HtmlDecode(collected);
}
The problem is that I also get the contents of script and style tags.
How can I exclude them?
// Parse the markup, then detach every <script> and <style> node from the tree.
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
// Snapshot the matches first so nodes are not removed while the tree is being walked.
var unwantedNodes = doc.DocumentNode.Descendants()
    .Where(n => n.Name == "script" || n.Name == "style")
    .ToList();
foreach (var unwanted in unwantedNodes)
{
    unwanted.Remove();
}
You can do so using HtmlDocument class:
// Parse the markup, then detach every <style> and <script> element matched by the XPath union.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);
var unwantedNodes = doc.DocumentNode.SelectNodes("//style|//script");
// SelectNodes yields null (not an empty collection) when nothing matches, so guard before enumerating.
if (unwantedNodes != null)
{
    // Iterate over a snapshot so removal does not disturb the enumeration.
    foreach (var unwanted in unwantedNodes.ToList())
    {
        unwanted.Remove();
    }
}
Some excellent answers, System.Linq is handy!
For a non Linq based approach:
/// <summary>
/// Removes every node matched by the XPath query "//script" from the given document.
/// </summary>
/// <param name="webDocument">Document to modify in place.</param>
/// <returns>The same <paramref name="webDocument"/> instance that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection scriptNodes = webDocument.DocumentNode.SelectNodes("//script");

    // A null result means there is nothing to remove.
    if (scriptNodes != null)
    {
        foreach (HtmlNode script in scriptNodes)
        {
            script.Remove();
        }
    }

    return webDocument;
}
/// <summary>
/// Parses <paramref name="html"/> and removes every attribute named "style" or
/// "script" from the document node and all of its descendants.
/// </summary>
/// <param name="html">HTML markup to clean.</param>
/// <returns>The outer HTML of the document after the attributes have been removed.</returns>
public static string StripStyles(this string html)
{
    var parsed = new HtmlDocument();
    parsed.LoadHtml(html);

    foreach (var element in parsed.DocumentNode.DescendantsAndSelf())
    {
        // Materialise the matches first; attributes are removed from the collection being queried.
        var doomed = element.Attributes
            .Where(attr => attr.Name == "style" || attr.Name == "script")
            .ToList();
        doomed.ForEach(attr => attr.Remove());
    }

    return parsed.DocumentNode.OuterHtml;
}

How to get a text between nodes

I have a problem with extracting text between nodes. It shows me the entire span node. I would like to get the hour values, e.g. 4:45, 5:15, etc.
// Page to scrape ('@' marks a verbatim string literal).
var html = @"https://programtv.onet.pl/";
HtmlWeb web = new HtmlWeb();
var htmldoc = web.Load(html);
// XPath selects attributes with the '@' prefix: <span class="hour"> inside
// <div class="hours"> inside <div id="boxTV1">.
var findhours = htmldoc.DocumentNode.SelectNodes("//div[@id='boxTV1']//div[@class='hours']//span[@class='hour']");
if (findhours != null)
{
    foreach (var x in findhours)
    {
        // Prints the whole <span> element, markup included.
        Console.WriteLine(x.OuterHtml);
    }
}
else
{
    Console.WriteLine("node = null");
}
Console.ReadLine();
Application window
You can simply use the InnerText property of your HtmlNode object. Checkout the following documentation.
foreach (var x in findhours )
{
Console.WriteLine(x.InnerText);
}

AngleSharp return span id

I'm trying to get a grip on AngleSharp by returning a specific part of an html.
So far that's my Code:
// Download the page, parse it with AngleSharp and show the matching element in the label.
using (WebClient client = new WebClient())
{
string htmlCode = client.DownloadString("http://www.planetradio.de/music/trackfinder.html");
var parser = new HtmlParser();
var document = parser.Parse(htmlCode);
// NOTE(review): LocalName is compared with "span id" and the id value is looked up in ClassList -
// confirm this can match elements written as <span id="...">.
var blueListItemsLinq = document.All.Where(m => m.LocalName == "span id" && m.ClassList.Contains("headerTracklistCurrentSongArtist"));
// Assigns the element object itself (not its text) to the label; the last match wins.
foreach (var item in blueListItemsLinq)
label.Content = item;
}
What I want it to return is the current artist, which should be in the HTML under:
<div id="headerTracklistCurrentSong">
<span id="headerTracklistCurrentSongArtist">Olly Murs</span>
<span id="headerTracklistCurrentSongTitle">Kiss Me</span>
But I seem to have made a mistake, so I'd be glad if someone could help me here and explain it to me.
Thanks in advance to everyone answering. :)
The span elements are parsed as AngleSharp.Dom.Html.HtmlSpanElement
so your query should be:
var blueListItemsLinq = document.All.Where(m => m.LocalName == "span" && m.Id == "headerTracklistCurrentSongArtist");
Then you can get the text/values like this:
foreach (var item in blueListItemsLinq)
{
    label.Content = item.TextContent; // "Olly Murs"

    // FirstChild is only an anchor when the span wraps a link; for a span that holds
    // plain text the cast yields null, so guard before dereferencing.
    var child = item.FirstChild as AngleSharp.Dom.Html.IHtmlAnchorElement;
    if (child != null)
    {
        var text = child.Text; // "Olly Murs"
        var path = child.PathName; // "/music/trackfinder.html"
    }
}
UPDATED
Since the current artist names are shown in the table at "planetradio.de/music/trackfinder.html", you can get the names like this:
// Find the element with id "hitfindertable" (First throws if there is none) and treat it as a table.
var tableElement = document.All.First(m => m.Id == "hitfindertable");
var hitfinderTable = tableElement as AngleSharp.Dom.Html.IHtmlTableElement;
foreach (var row in hitfinderTable.Rows)
{
    // The third cell of each row is read as the artist name.
    var artistName = row.Cells[2].TextContent;
}

How to extract different divs between two anchor tags in htmlagility?

<html>
<A NAME="doc_id_1"></A>
<div class="find1">
Iam here, extract me.
</div>
<div class = "find2">
iam here also, extract me as well.
</div>
<A NAME="doc_id_2"></A>
</html>
I have used below code to extract data :
// Select the two anchors that bracket the content; XPath selects attributes with the '@' prefix.
var nodes = doc.DocumentNode.SelectNodes("//a[@name = 'doc_id_1']");
var nodes1 = doc.DocumentNode.SelectNodes("//a[@name = 'doc_id_2']");
foreach (HtmlNode node in nodes)
{
    string yourText1 = node.InnerText;
    //var yourText2 = node.NextSibling.SelectNodes("//div");
    // Only the node immediately following the anchor is inspected here.
    string yourText2 = node.NextSibling.InnerHtml;
    //foreach (HtmlNode var in yourText2)
    //{
    // string yourText3 = var.InnerHtml;
    //}
}
I don't want to rely on the class names of those divs, because I am writing generic code. Any help will be appreciated.
One option, using Linq:
var doc = new HtmlDocument();
doc.LoadHtml(html: Resources.Html);
// XPath selects attributes with the '@' prefix.
var startNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
var endNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_2']");
var parent = startNode.ParentNode;
var nodesYouWant = parent.ChildNodes
    .SkipWhile(node => node != startNode) // skip all nodes up to the start node
    .Skip(1) // skip the start node
    .TakeWhile(node => node != endNode) // take all nodes up to the next anchor
    .Where(node => node.Name == "div"); // select only div nodes
Or:
// Locate the bracketing anchors (XPath selects attributes with the '@' prefix),
// then keep only the <div> nodes that lie between them.
var currentNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
var endNode = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_2']");
var nodesYouWant = GetEnclosedNodes(currentNode, endNode).Where(node => node.Name == "div");
/// <summary>
/// Lazily yields the siblings that follow <paramref name="currentNode"/>, stopping
/// before <paramref name="endNode"/> or when there are no more siblings.
/// </summary>
/// <param name="currentNode">Node after which enumeration starts; it is not yielded itself.</param>
/// <param name="endNode">Node at which enumeration stops; it is not yielded.</param>
/// <returns>The sibling nodes strictly between the two nodes, in document order.</returns>
private static IEnumerable<HtmlNode> GetEnclosedNodes(HtmlNode currentNode, HtmlNode endNode)
{
    for (var cursor = currentNode.NextSibling;
         cursor != null && cursor != endNode;
         cursor = cursor.NextSibling)
    {
        yield return cursor;
    }
}
I'm assuming you will know the name value of the two anchor tags.
var doc = new HtmlDocument();
// NOTE: the markup must be loaded into doc (e.g. with doc.LoadHtml) before it is queried.
// XPath selects attributes with the '@' prefix.
var firstAnchor = doc.DocumentNode.SelectSingleNode("//a[@name = 'doc_id_1']");
var div = firstAnchor.NextSibling;
// Walk the following siblings until the closing anchor (<a name="doc_id_2">) is reached or the
// siblings run out. The anchor is recognised by its name ATTRIBUTE; HtmlNode.Name is the tag name.
while (div != null && !(div.Name == "a" && div.GetAttributeValue("name", string.Empty) == "doc_id_2"))
{
    // Siblings can include text nodes (e.g. whitespace between tags); only handle <div> elements.
    if (div.Name == "div")
    {
        var divText = div.InnerText; //do whatever with this
    }
    div = div.NextSibling;
}

htmlagilitypack - remove script and style?

I'm using the following method to extract text from HTML:
/// <summary>
/// Extracts text from <paramref name="_html"/>: the inner text of each node that has
/// no children is trimmed and appended on its own line, then the result is HTML-decoded.
/// </summary>
/// <param name="_html">HTML markup to read.</param>
/// <returns>
/// The decoded text. Any exception raised while parsing or walking the document is
/// swallowed, in which case an empty string is returned.
/// </returns>
public string getAllText(string _html)
{
    string result = "";
    try
    {
        var htmlDocument = new HtmlAgilityPack.HtmlDocument();
        htmlDocument.LoadHtml(_html);
        var buffer = new StringBuilder();
        foreach (var candidate in htmlDocument.DocumentNode.DescendantNodesAndSelf())
        {
            // Skip container nodes; only leaves carry their own text.
            if (candidate.HasChildNodes)
            {
                continue;
            }

            string content = candidate.InnerText;
            if (!string.IsNullOrEmpty(content))
            {
                buffer.AppendLine(content.Trim());
            }
        }
        result = buffer.ToString();
    }
    catch (Exception)
    {
        // Deliberately ignored: the method falls back to the empty result.
    }
    result = System.Web.HttpUtility.HtmlDecode(result);
    return result;
}
The problem is that I also get the contents of script and style tags.
How can I exclude them?
// Parse the markup, then detach every <script> and <style> node from the tree.
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);
// Take a snapshot of the matches so nodes are not removed while the tree is being walked.
var scriptAndStyleNodes = doc.DocumentNode.Descendants()
    .Where(n => n.Name == "script" || n.Name == "style")
    .ToList();
foreach (var match in scriptAndStyleNodes)
{
    match.Remove();
}
You can do so using HtmlDocument class:
// Parse the markup, then detach every <style> and <script> element matched by the XPath union.
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(input);
var matches = doc.DocumentNode.SelectNodes("//style|//script");
// SelectNodes yields null (not an empty collection) when nothing matches, so guard before enumerating.
if (matches != null)
{
    // Iterate over a snapshot so removal does not disturb the enumeration.
    foreach (var match in matches.ToList())
    {
        match.Remove();
    }
}
Some excellent answers, System.Linq is handy!
For a non Linq based approach:
/// <summary>
/// Detaches all nodes selected by the XPath expression "//script" from the document.
/// </summary>
/// <param name="webDocument">The document whose script nodes are removed; modified in place.</param>
/// <returns>The <paramref name="webDocument"/> instance that was passed in.</returns>
private HtmlAgilityPack.HtmlDocument RemoveScripts(HtmlAgilityPack.HtmlDocument webDocument)
{
    HtmlAgilityPack.HtmlNodeCollection found = webDocument.DocumentNode.SelectNodes("//script");

    // Nothing selected: hand the document back untouched.
    if (found != null)
    {
        foreach (HtmlNode scriptNode in found)
        {
            scriptNode.Remove();
        }
    }

    return webDocument;
}
/// <summary>
/// Loads <paramref name="html"/> into a document and strips every attribute whose
/// name is "style" or "script" from the document node and each descendant.
/// </summary>
/// <param name="html">HTML markup to process.</param>
/// <returns>The document's outer HTML once the attributes are gone.</returns>
public static string StripStyles(this string html)
{
    var htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html);

    foreach (var current in htmlDocument.DocumentNode.DescendantsAndSelf())
    {
        // Copy the matches to a list before removing, since removal mutates the queried collection.
        var matchingAttributes = current.Attributes
            .Where(a => a.Name == "style" || a.Name == "script")
            .ToList();
        matchingAttributes.ForEach(a => a.Remove());
    }

    return htmlDocument.DocumentNode.OuterHtml;
}

Categories

Resources