I am really new to using HTMLAgilityPack. I have the following HTML document :
<a href="https://twitter.com/RedGiantNews" target="_blank"><img
src="http://image.e.redgiant.com/lib/998.png" width="24" border="0"
alt="Twitter" title="Twitter" class="smImage"></a><a
href="https://www.facebook.com/RedGiantSoftware" target="_blank"><img
src="http://image.e.redgiant.com/lib/db5.png" width="24" border="0"
alt="Facebook" title="Facebook" class="smImage"></a>
http://click.e.redgiant.com/?qs=d2ad061f
<a href="https://www.instagram.com/redgiantnews/" target="_blank"><img
src="http://image.e.redgiant.com/aa10-f8747e56f06d.png" width="24"
border="0" alt="Instagram" title="Instagram" class="smImage"></a>
I am trying to remove all images, i.e. all <img ...> nodes (if "node" is the right word), from the HTML file. I tried the code below, taken from another solution on Stack Overflow, but in vain: it returns the same HTML as above:
// Collects the inner markup of the nodes directly under the document root,
// leaving out any of them that is itself an <img> or an <a> element.
var sb = new StringBuilder();
doc.LoadHtml(inputHTml);
foreach (var child in doc.DocumentNode.ChildNodes)
{
    bool isImageOrAnchor = child.Name == "img" || child.Name == "a";
    if (isImageOrAnchor)
    {
        continue;
    }

    sb.Append(child.InnerHtml);
}
// Sample markup: three anchors, each wrapping a single img element.
// Declared as a verbatim string literal (@"..."), which is why every embedded
// double quote is written twice and the literal may span several lines.
static string OutputHtml = @"<a href=""https://twitter.com/RedGiantNews"" target=""_blank""><img
src=""http://image.e.redgiant.com/lib/998.png"" width=""24"" border=""0""
alt=""Twitter"" title=""Twitter"" class=""smImage""></a><a
href = ""https://www.facebook.com/RedGiantSoftware"" target=""_blank""><img
src = ""http://image.e.redgiant.com/lib/db5.png"" width=""24"" border=""0""
alt=""Facebook"" title=""Facebook"" class=""smImage""></a>
<a href = ""https://www.instagram.com/redgiantnews/"" target=""_blank""><img
src = ""http://image.e.redgiant.com/aa10-f8747e56f06d.png"" width=""24""
border=""0"" alt=""Instagram"" title=""Instagram"" class=""smImage""></a>";
I removed the floating link (http://click.e.redgiant.com/?qs=d2ad061f) from the original html string.
Approach One:
/// <summary>
/// Removes every img element from the supplied HTML.
/// </summary>
/// <param name="html">The HTML markup to clean.</param>
/// <returns>The document's outer HTML after all img elements have been removed.</returns>
public static string RemoveAllImageNodes(string html)
{
    HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(html);

    // SelectNodes yields null, not an empty collection, when nothing matches,
    // so markup without images must skip the loop instead of enumerating null.
    var nodes = document.DocumentNode.SelectNodes("//img");
    if (nodes != null)
    {
        foreach (var node in nodes)
        {
            // To keep the element and drop only its source, call
            // node.Attributes.Remove("src") here instead.
            node.Remove();
        }
    }

    // No try/catch: the previous "throw ex;" only reset the stack trace.
    // Failures now propagate to the caller with their original trace.
    return document.DocumentNode.OuterHtml;
}
Approach Two:
/// <summary>
/// Removes every img element from the supplied HTML.
/// </summary>
/// <param name="html">The HTML markup to clean.</param>
/// <returns>The document's outer HTML after all img elements have been removed.</returns>
public static string RemoveAllImageNodes(string html)
{
    HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(html);

    // Ask the parsed tree rather than searching the raw text for "<img":
    // the text search is case-sensitive (it misses "<IMG") and can succeed on
    // text inside a comment, in which case SelectNodes still returns null.
    var imageNodes = document.DocumentNode.SelectNodes("//img");
    if (imageNodes == null)
    {
        return document.DocumentNode.OuterHtml;
    }

    foreach (var eachNode in imageNodes)
    {
        // To keep the element and drop only its source, call
        // eachNode.Attributes.Remove("src") here instead.
        eachNode.Remove();
    }

    // No try/catch: the previous "throw ex;" only reset the stack trace.
    return document.DocumentNode.OuterHtml;
}
OutPut Html:
Output Html - After removing only the "src" attributes from "img" tag(s):
<img width="24" border="0" alt="Twitter" title="Twitter" class="smImage">
<img width="24" border="0" alt="Facebook" title="Facebook" class="smImage">
<img width="24" border="0" alt="Instagram" title="Instagram" class="smImage">
Related
I have a xml file which looks like below.
<sections>
<section>
<name>QLD Mosques</name>
<stations>
<station>
<id>101</id>
<pn>true</pn>
<name>Kuraby Mosque</name>
<url>http://sb2110.ultrastream.co.uk/kuraby</url>
<icon>...</icon>
</station>
<station>
<id>102</id>
<pn>true</pn>
<name>Gold Coast Mosque</name>
<url>http://sb2110.ultrastream.co.uk/goldcoast</url>
<icon>http://www.juju.net.au/mosquereceivers/images/icons/gc.jpg</icon>
</station>
<station>...</station>
</stations>
</section>
<section>
<name>NZ Mosques</name>
<stations>...</stations>
</section>
<section>
<name>Islamic Radio Stations</name>
<stations>...</stations>
</section>
</sections>
I want to show all the station names that belong to the "section" named "QLD Mosques".
For example, my result would be "Kuraby Mosque,Gold Coast Mosque,...".
How can I achieve this result?
N.B.:
I can show the names directly under each "section" tag (which gives the result QLD Mosques, NZ Mosques, Islamic Radio Stations) by using this code:
// Shared list that the download-completed handler fills and binds to the list box.
public static List<MyData> channel_main_list = new List<MyData>();

/// <summary>
/// Builds the page and starts an asynchronous download of the stations XML;
/// the response is handled in client_OpenReadCompleted.
/// </summary>
public MainChannelList()
{
    InitializeComponent();

    var stationsFeed = new Uri("http://www.juju.net.au/mosquereceivers/Stations.xml", UriKind.Absolute);
    var downloader = new WebClient();
    downloader.OpenReadCompleted += client_OpenReadCompleted;
    downloader.OpenReadAsync(stationsFeed);
}
/// <summary>
/// Handles completion of the stations download: parses the XML, adds one
/// MyData entry per "section" element (using its "name" child) to
/// channel_main_list and binds that list to the list box.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Carries the download error, if any, and the result stream.</param>
void client_OpenReadCompleted(object sender, OpenReadCompletedEventArgs e)
{
    if (e.Error != null)
    {
        return;
    }

    try
    {
        XDocument loadedData;

        // Dispose the response stream once parsed. Loading happens inside the
        // try so that a malformed document is reported instead of escaping
        // the event handler.
        using (Stream str = e.Result)
        {
            loadedData = XDocument.Load(str);
        }

        foreach (var item in loadedData.Descendants("section"))
        {
            // A section without a <name> child is reported and skipped
            // explicitly rather than through a caught NullReferenceException.
            XElement nameElement = item.Element("name");
            if (nameElement == null)
            {
                MessageBox.Show("Problem");
                continue;
            }

            MyData m = new MyData();
            m.channel_name = nameElement.Value;
            channel_main_list.Add(m);
        }

        listBox.ItemsSource = channel_main_list;
    }
    catch (Exception)
    {
        MessageBox.Show("Connectivity Problem");
    }
}
This is one possible way, assuming that the XML structure is consistent and name being searched is always found :
// Every <name> element of each <station> inside the <stations> child of the
// section whose own <name> is "QLD Mosques". The query is evaluated lazily.
var result =
    from section in loadedData.Descendants("section")
    where (string)section.Element("name") == "QLD Mosques"
    from stationName in section.Elements("stations")
                               .Elements("station")
                               .Elements("name")
    select stationName;
Dotnetfiddle Demo
I want to get plain text in the body tag.
Markup:
**simple text 1**
<div>------</div>
<font>-------</font>
**simple text 2**
Code:
// Visits each HtmlElement in the All collection of the browser document's body.
foreach (HtmlElement elm in webBrowser1.Document.Body.All)
{
// Placeholder: the plain text is meant to be collected here.
// NOTE(review): the loop is typed over HtmlElement, so text that is not wrapped
// in its own element is presumably not visited on its own - confirm.
}
Simply:
// Reads the InnerText property of the document body in one step; no per-element loop is needed.
string plainText = webBrowser1.Document.Body.InnerText;
I find this easy way :
// Re-parse the browser's body markup with HtmlAgilityPack and walk all of its
// descendants, stopping only on text nodes.
var htmlDoc = new HtmlAgilityPack.HtmlDocument();
string bodyMarkup = webBrowser1.Document.Body.InnerHtml;
htmlDoc.LoadHtml(bodyMarkup);
foreach (var descendant in htmlDoc.DocumentNode.Descendants())
{
    if (descendant.NodeType != HtmlNodeType.Text)
    {
        continue;
    }

    // A text node: its InnerText is the plain text being looked for.
    var innerText = descendant.InnerText;
}
have a good time.
Try the following approach: it gets all the text shown in the browser preview by selecting the whole page and copying it to the clipboard.
string plainText= StripHTML(webBrowser1);// call this way-----
/// <summary>
/// Returns the text of the page shown in <paramref name="webp"/> by selecting
/// everything in its document, copying the selection and reading the clipboard.
/// Any failure while selecting or copying is shown in a message box, after
/// which the current clipboard text is still returned.
/// </summary>
/// <param name="webp">The browser control whose document is copied.</param>
/// <returns>The text held by the clipboard once the copy has been attempted.</returns>
public string StripHTML(WebBrowser webp)
{
    try
    {
        // Clear first so that text from an earlier copy is not left behind.
        Clipboard.Clear();

        var page = webp.Document;
        page.ExecCommand("SelectAll", true, null);
        page.ExecCommand("Copy", false, null);
    }
    catch (Exception copyFailure)
    {
        MessageBox.Show(copyFailure.Message);
    }

    string copiedText = Clipboard.GetText();
    return copiedText;
}
How can I catch an email address from a website ?
I am trying to extract an email address from a website, but it seems to be protected by some JavaScript...
Here is the HTML code :
<p class="email">
<a href="mailto:info@aryanaz.ir" class="email">
info@aryanaz.ir
<script type="text/javascript">
/* <![CDATA[ */
(function(){try{var s,a,i,j,r,c,l,b=document.getElementsByTagName("script");l=b[b.length-1].previousSibling;a=l.getAttribute('data-cfemail');if(a){s='';r=parseInt(a.substr(0,2),16);for(j=2;a.length-j;j+=2){c=parseInt(a.substr(j,2),16)^r;s+=String.fromCharCode(c);}s=document.createTextNode(s);l.parentNode.replaceChild(s,l);}}catch(e){}})();
/* ]]> */
</script></a>
</p>
I use this code to read the protected value, but it doesn't work:
// Loads the page and stores the content of every mailto: link through the
// "addfullvalueemail" stored procedure.
HtmlAgilityPack.HtmlDocument doc = hw.Load(url);

// XPath addresses attributes with '@' (@href). SelectNodes returns null when
// no anchor matches, so that case is guarded before enumerating.
var Email = doc.DocumentNode.SelectNodes("//a[contains(@href, 'mailto:')]");
if (Email != null)
{
    foreach (HtmlNode node in Email)
    {
        // NOTE(review): InnerHtml is the anchor's inner markup, so it also
        // contains any nested element such as a <script> child - confirm
        // that this is the value meant to be stored.
        string email = node.InnerHtml.Trim();
        if (email != "")
        {
            ClassBase.ENonQuery("addfullvalueemail ", System.Data.CommandType.StoredProcedure, new SqlParameter[]
            {
                // SQL Server parameter names are prefixed with '@'.
                new SqlParameter("@Email ", email),
            });
        }
    }
}
I'm working on a web scraper. The following text shows the results of the code given at the end of this question, which gets the values of all hrefs from a page.
I only want to get values that contain docid=
index.php?pageid=a45475a11ec72b843d74959b60fd7bd64556e8988583f
#
summary_of_documents.php
index.php?pageid=a45475a11ec72b843d74959b60fd7bd64579b861c1d7b
#
index.php?pageid=a45475a11ec72b843d74959b60fd7bd64579e0509c7f0&apform=judiciary
decisions.php?doctype=Decisions / Signed
Resolutions&docid=1263778435388003271#sam
decisions.php?doctype=Decisions / Signed
Resolutions&docid=12637789021669321156#sam
?doctype=Decisions / Signed Resolutions&year=1986&month=January#head
?doctype=Decisions / Signed Resolutions&year=1986&month=February#head
Here's the code:
// Downloads the page named in urlTextBox, collects the href value of every
// anchor that has one, and appends the values to richTextBox9, one per line.
string url = urlTextBox.Text;
string sourceCode = Extractor.getSourceCode(url);
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(sourceCode);
List<string> links = new List<string>();

// XPath addresses attributes with '@' (@href). SelectNodes returns null when
// the page has no matching anchor; that is the real "nothing found" case.
// The former "links != null" tests were always true, so their else branches
// could never run.
var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
if (anchors == null)
{
    MessageBox.Show("No Links Found");
}
else
{
    foreach (HtmlAgilityPack.HtmlNode nd in anchors)
    {
        links.Add(nd.Attributes["href"].Value);
    }

    // Every matched anchor contributes one entry, so the list is non-empty
    // here. Append once instead of rewriting the control's text per link.
    richTextBox9.Text += string.Join("\n", links.ToArray()) + "\n";
}
How can I do this?
Why not just replace this:
links.Add(nd.Attributes["href"].Value);
with this:
// Keep the link only when its href carries a docid= query parameter.
string href = nd.Attributes["href"].Value;
if (href.Contains("docid="))
{
    links.Add(href);
}
I have some markup that contains certain HTML image tags with the class featured. What I need is to find all those images, add an anchor tag around the image, set the href attribute of the anchor to the images src value (the image path), and lastly replace the images src value with a new value (I call a method that will return this value).
<p>Some text here <img src="/my/path/image.png" alt="image description" class="featured" />. Some more text and another image that should not be modified <img src="/my/path/image2.png" alt="image description" /></p>
Should become.
<p>Some text here <img src="/new/path/from/method.png" alt="image description" class="featured" />. Some more text and another image that should not be modified <img src="/my/path/image2.png" alt="image description" /></p>
Don't use RegEx to parse HTML. See this classic SO answer for the reasons.
Use the HTML Agility Pack instead - you can use XPath to query your HTML.
Ended up with this code.
using System;
using System.Reflection;
using HtmlAgilityPack;
using log4net;
namespace Company.Web.Util
{
    /// <summary>
    /// Rewrites HTML so that every img element whose class attribute contains
    /// "featured" is replaced by a link to the original image that wraps a
    /// thumbnail image.
    /// </summary>
    public static class HtmlParser
    {
        private static readonly ILog _log = LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);

        /// <summary>
        /// Parses <paramref name="input"/>, wraps its featured images and
        /// returns the resulting markup.
        /// </summary>
        /// <param name="input">The HTML markup to process.</param>
        /// <returns>The rewritten markup with surrounding whitespace trimmed.</returns>
        public static string Parse(string input)
        {
            // The document is a local: keeping it in a shared static field let
            // concurrent calls overwrite each other's document.
            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(input);
            ParseNode(htmlDocument.DocumentNode);
            return htmlDocument.DocumentNode.WriteTo().Trim();
        }

        /// <summary>Processes every child of <paramref name="parentNode"/>.</summary>
        private static void ParseChildren(HtmlNode parentNode)
        {
            // Iterate from the end: a child may be replaced while it is being
            // processed, and the indexes still to be visited stay valid.
            for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
            {
                ParseNode(parentNode.ChildNodes[i]);
            }
        }

        /// <summary>
        /// Wraps <paramref name="node"/> when it is a featured image, then
        /// descends into its children, whatever the node type.
        /// </summary>
        private static void ParseNode(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Element && node.Name == "img" && IsFeatured(node))
            {
                WrapInLink(node);
            }

            if (node.HasChildNodes)
            {
                ParseChildren(node);
            }
        }

        /// <summary>Tells whether the node's class attribute contains "featured", ignoring case.</summary>
        private static bool IsFeatured(HtmlNode imageNode)
        {
            HtmlAttribute classAttribute = imageNode.Attributes["class"];
            return classAttribute != null
                && classAttribute.Value != null
                && classAttribute.Value.IndexOf("featured", StringComparison.OrdinalIgnoreCase) >= 0;
        }

        /// <summary>
        /// Replaces <paramref name="node"/> with an anchor that points at the
        /// original image and contains a new img showing the thumbnail.
        /// </summary>
        private static void WrapInLink(HtmlNode node)
        {
            HtmlAttribute source = node.Attributes["src"];
            if (source == null || node.ParentNode == null)
            {
                // Nothing to link to, or nowhere to insert the link.
                return;
            }

            try
            {
                string originaleImagePath = source.Value;
                // NOTE(review): GetImageThumbnail is not defined in this listing;
                // presumably it is supplied elsewhere in the class - confirm.
                string imageThumbnailPath = GetImageThumbnail(originaleImagePath);

                var anchorNode = HtmlNode.CreateNode("<a>");
                var imageNode = HtmlNode.CreateNode("<img>");

                // alt is optional; copy it only when the original image has one.
                HtmlAttribute alt = node.Attributes["alt"];
                if (alt != null)
                {
                    imageNode.SetAttributeValue("alt", alt.Value);
                }

                imageNode.SetAttributeValue("src", imageThumbnailPath);
                anchorNode.SetAttributeValue("href", originaleImagePath);
                anchorNode.AppendChild(imageNode);

                node.ParentNode.InsertBefore(anchorNode, node);
                node.ParentNode.RemoveChild(node);
            }
            catch (Exception exception)
            {
                // Best effort: a failure on one image must not abort the whole
                // document. The guard matches the level actually written.
                if (_log.IsWarnEnabled)
                {
                    _log.WarnFormat("Some message: {0}", exception);
                }
            }
        }
    }
}