I am trying to parse some XML, however, I get the error above this sentence.
Here is my code:
class Program
{
static void Main(string[] args)
{
string xml = #"<?xml version=""1.0"" encoding=""UTF-8""?><rss version=""2.0"" xmlns:georss=""http://www.georss.org/georss"">
<channel>
<title>asp.net Jobs in Boston, MA | Indeed.com</title>
<link>http://www.indeed.com/q-asp.net-l-boston,-jobs.html</link>
<description>Indeed.com - one search. all jobs. Search thousands of sites for asp.net Jobs in Boston, MA</description>
<language>en</language>
<copyright>Copyright (c) 2012 Indeed, Inc All rights reserved.</copyright>
<lastBuildDate>Sun, 25 Mar 2012 19:43:15 GMT</lastBuildDate>
<image>
<url>http://www.indeed.com/images/indeed_rss.png</url>
<title>Indeed.com - one search. all jobs.</title>
<link>http://www.indeed.com/</link>
</image>
<item>
<title>Software Engineer with ASP.Net applications experience - The Integrity Group - Andover, MA</title>
<link>http://www.indeed.com/job/Software-Engineer-With-ASP-Net-Application-Experience-at-The-Integrity-Group-in-Andover,-MA-6ab16ba39cc13536</link>
<source>JobHost</source>
<guid isPermaLink=""false"">d61c3504e9df0fc13be4abb4be209c38</guid>
<pubDate>Wed, 21 Mar 2012 05:04:47 GMT</pubDate>
<description>layers and preferably service based architecture. ASP.Net web applications with server-side controls and... Solid experience in ASP.Net, SQL Server, C#, XML, XML... <br/>
From JobHost - 21 Mar 2012 05:04:47 GMT
- View all <a href="http://www.indeed.com/l-Andover,-MA-jobs.html">Andover jobs</a>
</description>
<georss:point>42.64835 -71.15934</georss:point>
</item>
</channel></rss>";
foreach (SavedJob sj in GetJobs(xml))
{
Console.WriteLine(sj.title);
}
}
public static List<SavedJob> GetJobs(string xml)
{
//http://forum.unity3d.com/threads/31314-Include-Files-in-build
XmlDocument xmlDoc = new XmlDocument();
System.IO.StringReader stringReader = new System.IO.StringReader(xml);
stringReader.Read();
//http://unity3d.qatohost.com/questions/161528/loading-a-large-xml-file-200-multi-level-nodes-int.html
// skip BOM
xmlDoc.LoadXml(stringReader.ReadToEnd());
List<SavedJob> SavedJobs = new List<SavedJob>();
try
{
foreach (XmlElement found in xmlDoc.GetElementsByTagName("item"))
{
SavedJob sj = new SavedJob();
sj.title = found.GetElementsByTagName("title")[0].InnerText;
sj.link = found.GetElementsByTagName("link")[0].InnerText;
sj.description = found.GetElementsByTagName("description")[0].InnerText;
DateTime dt = new DateTime();
DateTime.TryParse(found.GetElementsByTagName("pubDate")[0].InnerText, out dt);
sj.date = dt;
SavedJobs.Add(sj);
}
}
//data source is null
catch (Exception)
{
}
return SavedJobs;
}
}
public class SavedJob
{
public string title { get; set; }
public string link { get; set; }
public string description { get; set; }
public DateTime date { get; set; }
}
My goal is to create a loading panel in MonoTouch, however, I can't seem to download the string because the XML is bad. Is there a way around this?
LoadingView lv = new LoadingView ();
WebClient wc = new WebClient ();
wc.DownloadStringCompleted += delegate(object sender, DownloadStringCompletedEventArgs e) {
// We got the async result now display data
InvokeOnMainThread (delegate {
if (e.Result != null) {
SavedJobs = basicOperations.GetJobs (e.Result);
TableView.Source = new DataSource (this);
TableView.ReloadData();
lv.Hide ();
}
});
};
lv.Show ("Loading");
wc.DownloadStringAsync (new Uri (uString));
Your code shows:
System.IO.StringReader stringReader = new System.IO.StringReader(xml);
stringReader.Read();
xmlDoc.LoadXml(stringReader.ReadToEnd());
What this does is:
create a string reader on ""
then read the first character - "<"
then try to load the remaining characters as xml - i.e. "xml ...>"
So you need to avoid the initial stringReader.Read(); call which is stripping the first opening angle bracket from the xml - making it invalid at 1,1
An even better way of doing this is creating a file from DownloadFileAsync:
WebClient wc = new WebClient();
string fPath = Environment.GetFolderPath(Environment.SpecialFolder.Personal) + "/myfile.xml";
wc.DownloadFileCompleted += delegate(object sender, AsyncCompletedEventArgs e) {
BasicOperations bas = new BasicOperations();
//save results as file
SavedJobs = bas.GetJobs(fPath);
TableView.Source = new DataSource (this);
TableView.ReloadData();
};
wc.DownloadFileAsync(new Uri(uString), fPath);
Related
I have searched on Google and here on Stack Overflow, but due to my limited expertise in C# and SSIS I have some troubles getting through.
My steps in this SSIS package is as follows:
I have a SQL task connected to a for loop which only specifies how many times I should iterate in Forloop container which is =1
Inside Forloop container have a dataflow task
In dataflow task, I use an OLEDB source to get addresses from a SQL table
I connect those OLEDB source to a Script task from which the following script "Should" return my XML answer from which I want to return Latitude and longitude. It does not return the answer that I want.
Example address is - Beihinger Strasse 160 DE-71726 BENNINGEN GERMANY
public override void Input0_ProcessInputRow(Input0Buffer Row)
{
string url = "http://dev.virtualearth.net/REST/v1/Locations?addressLine={0}&output=xml&key={1}";
var Address = Row.Address;
var wGet = WebRequest.Create(String.Format(url, Address, Variables.APIkey));
wGet.Method = WebRequestMethods.Http.Get;
var response = wGet.GetResponse();
var status = ((HttpWebResponse)response).StatusDescription;
Console.WriteLine("Input: " + Address);
if (status == "OK")
{
StreamReader reader = new StreamReader(response.GetResponseStream());
string responseFromServer = reader.ReadToEnd();
reader.Close();
var xmlDoc = new XmlDocument();
xmlDoc.LoadXml(responseFromServer);
var nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
nsmgr.AddNamespace("Point", "http://schemas.microsoft.com/search/local/ws/rest/v1");
var latNode = xmlDoc.DocumentElement.SelectSingleNode("/ResourceSets/ResourceSet/Resources/Location/Point/Latitude", nsmgr);
var lngNode = xmlDoc.DocumentElement.SelectSingleNode("/ResourceSets/ResourceSet/Resources/Location/Point/Longitude", nsmgr);
if (latNode != null && lngNode != null)
{
var numberFormatInfo = new NumberFormatInfo { NumberDecimalSeparator = "." };
Row.Latitude = decimal.Parse(latNode.InnerText, numberFormatInfo);
Row.Longitude = decimal.Parse(lngNode.InnerText, numberFormatInfo);
Row.FullResponse = responseFromServer.ToString();
}
else
{
//Set to null
Row.Latitude_IsNull = true;
Row.Longitude_IsNull = true;
Row.FullResponse = responseFromServer.ToString();
}
}
else
{
//Set to null
Row.Latitude_IsNull = true;
Row.Longitude_IsNull = true;
Row.FullResponse_IsNull = true;
}
response.Close();
}
}
The response I get is:
<?xml version="1.0" encoding="utf-8"?><Response xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://schemas.microsoft.com/search/local/ws/rest/v1"><Copyright>Copyright © 2020 Microsoft and its suppliers. All rights reserved. This API cannot be accessed and the content and any results may not be used, reproduced or transmitted in any manner without express written permission from Microsoft Corporation.</Copyright><BrandLogoUri>http://dev.virtualearth.net/Branding/logo_powered_by.png</BrandLogoUri><StatusCode>200</StatusCode><StatusDescription>OK</StatusDescription><AuthenticationResultCode>ValidCredentials</AuthenticationResultCode><TraceId>b9a94b411dc246ea82d161dae5e7a0c3|DU00000D73|0.0.0.1|Ref A: ACDD23537F6649F6B2F66333BB6D6D01 Ref B: DB3EDGE0918 Ref C: 2020-02-12T11:25:05Z</TraceId><ResourceSets><ResourceSet><EstimatedTotal>0</EstimatedTotal><Resources /></ResourceSet></ResourceSets></Response>
I want a correct call as I get when using URL so I can get Longitude and Latitude- http://dev.virtualearth.net/REST/v1/Locations?addressLine=Beihinger%20Strasse%20160%20DE-71726%20BENNINGEN%20GERMANY&output=xml&key={APIkey}
Any help would be greatly appreciated to what I am missing?
I'm downloading files from links and i saved them so far in one directory.
But now i need to download each group of files to a specific directory.
This is the content of the first text file i added as existing item to my project. The countries names.
Europe
Alps
Benelux
Germany
Spain & Portugal
France
Greece
Italy
Poland
Scandinavia
Turkey
UK & Ireland
Russia
Baltic
Balkan
Romania & Bulgaria
Hungary
Africa
Algeria
Cameroon
CanaryIslands
Congo
CentralAfrica
Nigeria
Chad
Egypt
Ethiopia
Israel
Libya
Madagascar
Morocco
Namibia
SaudiArabia
Somalia
SouthAfrica
Sudan
Tanzania
Tunesia
WestAfrica
Zambia
And i have another text file that present each country code:
eu
alps
nl
de
sp
fr
gr
it
pl
scan
tu
gb
ru
bc
ba
se
hu
af
dz
cm
ce
cg
caf
ng
td
eg
et
is
ly
mg
mo
bw
sa
so
za
sd
tz
tn
wa
zm
Why the codes are important ? since each link is built inside with the country code not name. For example:
http://www.sat24.com/image2.ashx?region=is&time=201612271600&ir=true
So i know that in the link the part region=is meaning that the country code in this case is: 'is' (Israel).
So now i need to find all the links that have the code 'is' should be downloading to the directory of Israel.
Next when the links will be with other regions for example:
http://www.sat24.com/image2.ashx?region=tu&time=201612271600&ir=true
So now the code tu is for the country Turkey so now every link with the code tu should be downloading to the Turkey directory.
So the first problem is how to connect between the code of the country in the links to the country name and then to download it to the correct country name directory that i allready created in the constructor ? I have all the countries names directories allready but i need somehow to connect between the codes(regions) in the links to the country name directory.
The second problem is that i'm making this downloads every 15 minutes. I will do it later with a timer. Every 15 minutes download the images again. But i don't want to delete or overwrite the old images the main idea is to save and keep the images. The problem is in each country every 15 minutes what sub paths should i create ? I mean what name to give each country sub paths every 15 minutes ?
I thought to create for each country a directory with the date and time range of the downloaded images but i'm not sure if it's a good idea.
Later i will want to be able to move between this directories of each country and that it will load the images to a pictureBox. The question is how i will be able to identify each directory that was downloaded every 15 minutes ?
The problem is not the downloading the program is working.
The two problems is with the directories.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Diagnostics;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace Downloader
{
public partial class Form1 : Form
{
int countCompleted = 0;
ExtractImages ei = new ExtractImages();
List<string> newList = new List<string>();
List<string> countryList = new List<string>();
List<string> countriesPaths = new List<string>();
private Queue<string> _downloadUrls = new Queue<string>();
public Form1()
{
InitializeComponent();
ManageDirectories();
lblDownloads.Text = "0";
ei.Init();
foreach (ExtractImages.Continent continent in ei.world.continents)
{
foreach (ExtractImages.Country country in continent.countries)
{
if (country.name == "Israel")
{
foreach (string imageUri in country.imageUrls)
{
countryList.Add(imageUri);
}
}
else
{
foreach (string imageUri in country.imageUrls)
{
newList.Add(imageUri);
}
}
}
}
}
private void ManageDirectories()
{
string savedImagesPath = Path.GetDirectoryName(Application.LocalUserAppDataPath);
string mainPath = "Countries";
mainPath = Path.Combine(savedImagesPath, mainPath);
string[] lines = File.ReadAllLines("CountriesNames.txt");
foreach(string path in lines)
{
string countryPath = Path.Combine(mainPath, path);
if (!Directory.Exists(countryPath))
{
Directory.CreateDirectory(countryPath);
}
countriesPaths.Add(countryPath);
}
string[] countriesCodes = File.ReadAllLines("CountriesCodes.txt");
}
private void downloadFile(IEnumerable<string> urls)
{
foreach (var url in urls)
{
_downloadUrls.Enqueue(url);
}
// Starts the download
btnStart.Text = "Downloading...";
btnStart.Enabled = false;
pbStatus.Visible = true;
DownloadFile();
}
int count = 0;
private void DownloadFile()
{
if (_downloadUrls.Any())
{
WebClient client = new WebClient();
client.DownloadProgressChanged += client_DownloadProgressChanged;
client.DownloadFileCompleted += client_DownloadFileCompleted;
var url = _downloadUrls.Dequeue();
//string FileName = url.Substring(url.LastIndexOf("/") + 1,
// (url.Length - url.LastIndexOf("/") - 1));
client.DownloadFileAsync(new Uri(url), countriesPaths[count] + ".gif");
RichTextBoxExtensions.AppendText(richTextBox1, "Downloading: ", Color.Red);
RichTextBoxExtensions.AppendText(richTextBox1, url, Color.Green);
richTextBox1.AppendText(Environment.NewLine);
count++;
return;
}
// End of the download
btnStart.Text = "Download Complete";
countCompleted = newList.Count;
lblDownloads.Text = countCompleted.ToString();
timer1.Enabled = true;
downloadFile(newList);
}
private void client_DownloadFileCompleted(object sender, AsyncCompletedEventArgs e)
{
if (e.Error != null)
{
RichTextBoxExtensions.UpdateText(richTextBox1, "Downloading: ", "Downloaded: ", Color.Red);
// handle error scenario
throw e.Error;
}
else
{
countCompleted--;
lblDownloads.Text = countCompleted.ToString();
RichTextBoxExtensions.UpdateText(richTextBox1, "Downloading: ", "Downloaded: ", Color.Green);
}
if (e.Cancelled)
{
// handle cancelled scenario
}
DownloadFile();
}
void client_DownloadProgressChanged(object sender, DownloadProgressChangedEventArgs e)
{
double bytesIn = double.Parse(e.BytesReceived.ToString());
double totalBytes = double.Parse(e.TotalBytesToReceive.ToString());
double percentage = bytesIn / totalBytes * 100;
pbStatus.Value = int.Parse(Math.Truncate(percentage).ToString());
}
private void Form1_Load(object sender, EventArgs e)
{
}
private void btnStart_Click(object sender, EventArgs e)
{
countCompleted = countryList.Count;
lblDownloads.Text = countCompleted.ToString();
downloadFile(countryList);
}
public class RichTextBoxExtensions
{
public static void AppendText(RichTextBox box, string text, Color color)
{
box.SelectionStart = box.TextLength;
box.SelectionLength = 0;
box.SelectionColor = color;
box.AppendText(text);
box.SelectionColor = box.ForeColor;
}
public static void UpdateText(RichTextBox box, string find, string replace, Color? color)
{
box.SelectionStart = box.Find(find, RichTextBoxFinds.Reverse);
box.SelectionLength = find.Length;
box.SelectionColor = color ?? box.SelectionColor;
box.SelectedText = replace;
}
}
private void timer1_Tick(object sender, EventArgs e)
{
}
private void SortList()
{
}
}
}
If the two files are ordered as they are in the example you can create dictionary form code to full name the other way around.
var codes = new List<string>() { ..... }
var countries = new List<string> () { ....}
creating the dictionary will look like this
var codeToFullNameMap = codes
.Select((code, index) => index)
.ToDictionary(
keySelector: index => codes[index].
elementSelector: index => cointires[index]);
Then having the dictionary created you can access the full country name by its code.
var countryName = codeToFullNameMap["tu"];
About the second question. If the naming gives you enough information and by using the current date time it think it does, should be pefrectly fine.
My suggestion is to create Directory for the country code and sub folder for each download start time (including hours, minutes and seconds will be enough i guess).
TU // (main folder)
-> 2017-01-26-18-50-10 // (subfolder 1)
-> img#1
-> img#2
-> 2017-01-26-18-50-10 // (subfolder 2)
I am not sure about the third question. If your folder is called TU, this is enough to tell that the images are for Turkey, or i am missing something ? And each sub folder contains images downloaded at interval of 15 minutes from the last one.
Maybe if you share more specifics about the third question, where exactly you see the problem, for example you need to visualize the images in desktop application by countries ?
Just use a generic dictionary to bind the country codes to the full name used in the directory path
Dictionary<string, string> countryCodeMapping = new Dictionary<string, string>() {
{"us", "United States"},
{"is", "Isreal"}
...
};
I have a xml file which looks like below.
<sections>
<section>
<name>QLD Mosques</name>
<stations>
<station>
<id>101</id>
<pn>true</pn>
<name>Kuraby Mosque</name>
<url>http://sb2110.ultrastream.co.uk/kuraby</url>
<icon>...</icon>
</station>
<station>
<id>102</id>
<pn>true</pn>
<name>Gold Coast Mosque</name>
<url>http://sb2110.ultrastream.co.uk/goldcoast</url>
<icon>http://www.juju.net.au/mosquereceivers/images/icons/gc.jpg</icon>
</station>
<station>...</station>
</stations>
</section>
<section>
<name>NZ Mosques</name>
<stations>...</stations>
</section>
<section>
<name>Islamic Radio Stations</name>
<stations>...</stations>
</section>
</sections>
I want get show all the station name which has "section" named "QLD Mosques".
For example my result will be "Kuraby Mosque,Gold Coast Mosque,...".
How can i achieve the result??
N:B:
I can show the names under "section" tag(which gives the result QLD Mosques,NZ Mosques,Islamic Radio Stations) by using these code:
public static List<MyData> channel_main_list = new List<MyData>();
public MainChannelList()
{
InitializeComponent();
WebClient client = new WebClient();
client.OpenReadCompleted += client_OpenReadCompleted;
client.OpenReadAsync(new Uri("http://www.juju.net.au/mosquereceivers/Stations.xml",UriKind.Absolute));
}
void client_OpenReadCompleted(object sender, OpenReadCompletedEventArgs e)
{
if (e.Error != null)
return;
Stream str = e.Result;
string node = "section";
XDocument loadedData = XDocument.Load(str);
try
{
foreach (var item in loadedData.Descendants(node))
{
try
{
MyData m = new MyData();
m.channel_name = item.Element("name").Value;
channel_main_list.Add(m);
}
catch (Exception)
{
MessageBox.Show("Problem");
}
}
listBox.ItemsSource = channel_main_list;
}
catch (Exception)
{
MessageBox.Show("Connectivity Problem");
}
}
This is one possible way, assuming that the XML structure is consistent and name being searched is always found :
var result = loadedData.Descendants("section")
.Where(o => (string)o.Element("name") == "QLD Mosques")
.Elements("stations")
.Elements("station")
.Elements("name");
Dotnetfiddle Demo
Is there a way to use the SQL Server 2012 Microsoft.SqlServer.Dac Namespace to determine if a database has an identical schema to that described by a DacPackage object? I've looked at the API docs for DacPackage as well as DacServices, but not having any luck; am I missing something?
Yes there is, I have been using the following technique since 2012 without issue.
Calculate a fingerprint of the dacpac.
Store that fingerprint in the target database.
The .dacpac is just a zip file containing goodies like metadata, and
model information.
Here's a screen-grab of what you will find in the .dacpac:
The file model.xml has XML structured like the following
<DataSchemaModel>
<Header>
... developer specific stuff is in here
</Header>
<Model>
.. database model definition is in here
</Model>
</<DataSchemaModel>
What we need to do is extract the contents from <Model>...</Model>
and treat this as the fingerprint of the schema.
"But wait!" you say. "Origin.xml has the following nodes:"
<Checksums>
<Checksum Uri="/model.xml">EB1B87793DB57B3BB5D4D9826D5566B42FA956EDF711BB96F713D06BA3D309DE</Checksum>
</Checksums>
In my experience, this <Checksum> node changes regardless of a schema change in the model.
So let's get to it.
Calculate the fingerprint of the dacpac.
using System.IO;
using System.IO.Packaging;
using System.Security.Cryptography;
static string DacPacFingerprint(byte[] dacPacBytes)
{
using (var ms = new MemoryStream(dacPacBytes))
using (var package = ZipPackage.Open(ms))
{
var modelFile = package.GetPart(new Uri("/model.xml", UriKind.Relative));
using (var streamReader = new System.IO.StreamReader(modelFile.GetStream()))
{
var xmlDoc = new XmlDocument() { InnerXml = streamReader.ReadToEnd() };
foreach (XmlNode childNode in xmlDoc.DocumentElement.ChildNodes)
{
if (childNode.Name == "Header")
{
// skip the Header node as described
xmlDoc.DocumentElement.RemoveChild(childNode);
break;
}
}
using (var crypto = new SHA512CryptoServiceProvider())
{
byte[] retVal = crypto.ComputeHash(Encoding.UTF8.GetBytes(xmlDoc.InnerXml));
return BitConverter.ToString(retVal).Replace("-", "");// hex string
}
}
}
}
With this fingerprint now available, pseudo code for applying a dacpac can be:
void main()
{
var dacpacBytes = File.ReadAllBytes("<path-to-dacpac>");
var dacpacFingerPrint = DacPacFingerprint(dacpacBytes);// see above
var databaseFingerPrint = Database.GetFingerprint();//however you choose to do this
if(databaseFingerPrint != dacpacFingerPrint)
{
DeployDacpac(...);//however you choose to do this
Database.SetFingerprint(dacpacFingerPrint);//however you choose to do this
}
}
Here's what I've come up with, but I'm not really crazy about it. If anyone can point out any bugs, edge cases, or better approaches, I'd be much obliged.
...
DacServices dacSvc = new DacServices(connectionString);
string deployScript = dacSvc.GenerateDeployScript(myDacpac, #"aDb", deployOptions);
if (DatabaseEqualsDacPackage(deployScript))
{
Console.WriteLine("The database and the DacPackage are equal");
}
...
bool DatabaseEqualsDacPackage(string deployScript)
{
string equalStr = string.Format("GO{0}USE [$(DatabaseName)];{0}{0}{0}GO{0}PRINT N'Update complete.'{0}GO", Environment.NewLine);
return deployScript.Contains(equalStr);
}
...
What I really don't like about this approach is that it's entirely dependent upon the format of the generated deployment script, and therefore extremely brittle. Questions, comments and suggestions very welcome.
#Aaron Hudon answer does not account for post script changes. Sometimes you just add a new entry to a type table without changing the model. In our case we want this to count as new dacpac. Here is my modification of his code to account for that
private static string DacPacFingerprint(string path)
{
using (var stream = File.OpenRead(path))
using (var package = Package.Open(stream))
{
var extractors = new IDacPacDataExtractor [] {new ModelExtractor(), new PostScriptExtractor()};
string content = string.Join("_", extractors.Select(e =>
{
var modelFile = package.GetPart(new Uri($"/{e.Filename}", UriKind.Relative));
using (var streamReader = new StreamReader(modelFile.GetStream()))
{
return e.ExtractData(streamReader);
}
}));
using (var crypto = new MD5CryptoServiceProvider())
{
byte[] retVal = crypto.ComputeHash(Encoding.UTF8.GetBytes(content));
return BitConverter.ToString(retVal).Replace("-", "");// hex string
}
}
}
private class ModelExtractor : IDacPacDataExtractor
{
public string Filename { get; } = "model.xml";
public string ExtractData(StreamReader streamReader)
{
var xmlDoc = new XmlDocument() { InnerXml = streamReader.ReadToEnd() };
foreach (XmlNode childNode in xmlDoc.DocumentElement.ChildNodes)
{
if (childNode.Name == "Header")
{
// skip the Header node as described
xmlDoc.DocumentElement.RemoveChild(childNode);
break;
}
}
return xmlDoc.InnerXml;
}
}
private class PostScriptExtractor : IDacPacDataExtractor
{
public string Filename { get; } = "postdeploy.sql";
public string ExtractData(StreamReader stream)
{
return stream.ReadToEnd();
}
}
private interface IDacPacDataExtractor
{
string Filename { get; }
string ExtractData(StreamReader stream);
}
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 9 years ago.
Improve this question
I'm new in a C# and WinForms I want to create a web crawler (parser) - which can parse a web pages and showing them hierarchically. + I don't know how to make bot crawling with a specific hyper-link depth.
So I think I have 2 questions:
How to make bot crawling with specified link depth?
How to show all hyperlinks hierarchically?
P.S. I would be great if it'll be a code samples.
P.P.S. have 1 button = button1; and 1 richtextbox = richTextBox1;
Here is my code: I know it's very ugly.... (all code in a one button):
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
}
private void button1_Click(object sender, EventArgs e)
{
//Declaration
HttpWebRequest request = (HttpWebRequest) WebRequest.Create(url);
HttpWebResponse response = (HttpWebResponse) request.GetResponse();
StreamReader sr = new StreamReader(response.GetResponseStream());
Match m;
string anotherTest = #"(((ht){1}tp[s]?://)[-a-zA-Z0-9#:%_\+.~#?&\\]+)";
List<string> savedUrls = new List<string>();
List<string> titles = new List<string>();
//Go to this URL:
string url = UrlTextBox.Text = "http://www.yahoo.com";
if (!(url.StartsWith("http://") || url.StartsWith("https://")))
url = "http://" + url;
//Scrape Whole Html code:
string s = sr.ReadToEnd();
try
{
// Get Urls:
m = Regex.Match(s, anotherTest,
RegexOptions.IgnoreCase | RegexOptions.Compiled,
TimeSpan.FromSeconds(1));
while (m.Success)
{
savedUrls.Add(m.Groups[1].ToString());
m = m.NextMatch();
}
// Get TITLES:
Match m2 = Regex.Match(s, #"<title>\s*(.+?)\s*</title>");
if (m2.Success)
{
titles.Add(m2.Groups[1].Value);
}
//Show Title:
richTextBox1.Text += titles[0] + "\n";
//Show Urls:
TrimUrls(ref savedUrls);
}
catch (RegexMatchTimeoutException)
{
Console.WriteLine("The matching operation timed out.");
}
sr.Close();
}
private void TrimUrls(ref List<string> urls)
{
List<string> d = urls.Distinct().ToList();
foreach (var v in d)
{
if (v.IndexOf('.') != -1 && v != "http://www.w3.org")
{
richTextBox1.Text += v + "\n";
}
}
}
}
}
And one more question:
Is Anybody know how to save it in XML like a tree?
I would also highly recommend you the HTML Agility Pack.
With the Html Agility Pack you can do something like:
var doc = new HtmlDocument();
doc.LoadHtml(html);
var urls = new List<String>();
doc.DocumentNode.SelectNodes("//a").ForEach(x =>
{
urls.Add(x.Attributes["href"].Value);
});
Edit:
You can do something like this, but please add some exception handling to it.
public class ParsResult
{
public ParsResult Parent { get; set; }
public String Url { get; set; }
public Int32 Depth { get; set; }
}
__
private readonly List<ParsResult> _results = new List<ParsResult>();
private Int32 _maxDepth = 5;
public void Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
{
if (depth >= _maxDepth) return;
String html;
using (var wc = new WebClient())
html = wc.DownloadString(urlToCheck ?? parent.Url);
var doc = new HtmlDocument();
doc.LoadHtml(html);
var aNods = doc.DocumentNode.SelectNodes("//a");
if (aNods == null || !aNods.Any()) return;
foreach (var aNode in aNods)
{
var url = aNode.Attributes["href"];
if (url == null)
continue;
var result = new ParsResult
{
Depth = depth,
Parent = parent,
Url = url.Value
};
_results.Add(result);
Console.WriteLine("{0} - {1}", depth, result.Url);
Foo(depth: depth + 1, parent: result);
}
If you need parse such structured data (xhtml), try to look at xpath: http://msdn.microsoft.com/en-us/library/ms256086.aspx
(You should also put your logic in to dedicated objects, not just let it be in GUI layer. You will appreciate it later.)