I am trying to perform a GET request to https://sede.educacion.gob.es/publiventa/catalogo.action?cod=E. With the cod=E parameter, the website opens a menu below "Materias de educación" when viewed in the browser, but when I perform the request from C# this menu is not loaded, and I need it. This is the code I am using to read the HTML as a string so that I can later parse it with HtmlAgilityPack.
/// <summary>
/// Downloads the page at <paramref name="urlAddress"/> and returns its body as text.
/// </summary>
/// <param name="urlAddress">Absolute URL of the page to fetch.</param>
/// <returns>
/// The response body, or <c>null</c> when the server answers with a status other than 200 OK.
/// </returns>
private string readHtml(string urlAddress)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0";
    request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    request.AutomaticDecompression = DecompressionMethods.GZip;

    // The using blocks release the connection on every path, including the
    // early return and any exception thrown while reading.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return null;
        }

        // CharacterSet may be null or empty when the server sends no charset;
        // Encoding.GetEncoding would throw on an empty name, so let the
        // StreamReader use its default detection in that case.
        string charset = response.CharacterSet;
        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = string.IsNullOrEmpty(charset)
            ? new StreamReader(receiveStream)
            : new StreamReader(receiveStream, Encoding.GetEncoding(charset)))
        {
            return readStream.ReadToEnd();
        }
    }
}
The Uri you posted (https://sede.educacion.gob.es/publiventa/catalogo.action?cod=E) uses a Javascript switch to show the Menu content.
When you connect to that Uri (without clicking a menu link), that site shows three different versions of that page.
1) Page with closed menu and proposed new editions
2) Page with closed menu and search engine fields
3) Page with open menu and a selection of the menu content
This switch is based on an internal procedure which records the current session. Unless you click on a menu link (which is connected to an event listener), the JavaScript procedure shows the page in different states.
I had a look at it; those scripts are quite long (a whole multi-purpose library) and I had no time to parse them all (maybe you can do that) to find out what parameters the event listener is passing.
But, the three-state version switch is constant.
What I mean is you can call that page three times, preserving the Cookie Container: the third time you connect to it, it will stream the whole menu content and its links.
If you request the same page three times, the third time the HTML page will
contain all the Materias de educación links:
// Requests the same page three times in a row through GetHttpStream.
// Each result overwrites the previous one, so only the body returned by the
// third call is left in HtmlPage.
// [URI] is a placeholder for the address of the page to request.
// NOTE(review): declared async void, so callers cannot await it or observe
// its exceptions — consider returning Task if this is not an event handler.
public async void SomeMethodAsync()
{
string HtmlPage = await GetHttpStream([URI]);
HtmlPage = await GetHttpStream([URI]);
HtmlPage = await GetHttpStream([URI]);
}
This is, more or less, what I used to get that page:
// Cookie store shared by every request, so consecutive calls are seen by the
// server as one session.
CookieContainer CookieJar = new CookieContainer();

/// <summary>
/// Requests <paramref name="HtmlPage"/> with browser-like headers, reusing the
/// shared <c>CookieJar</c>, and returns the response body as text.
/// </summary>
/// <param name="HtmlPage">Address of the page to download.</param>
/// <returns>
/// The trimmed response body, or an empty string when the status is not 200 OK
/// or the request or read fails.
/// </returns>
public async Task<string> GetHttpStream(Uri HtmlPage)
{
    string Payload = string.Empty;
    HttpWebRequest httpRequest = WebRequest.CreateHttp(HtmlPage);
    try
    {
        // The container is updated in place by the request, so assigning the
        // shared instance is all that is needed to carry cookies across calls.
        httpRequest.CookieContainer = CookieJar;
        httpRequest.KeepAlive = true;
        httpRequest.ConnectionGroupName = Guid.NewGuid().ToString();
        httpRequest.AllowAutoRedirect = true;
        // With AutomaticDecompression the response stream is already inflated,
        // so the body must NOT be run through GZipStream/DeflateStream again.
        httpRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        httpRequest.ServicePoint.MaxIdleTime = 30000;
        httpRequest.ServicePoint.Expect100Continue = false;
        httpRequest.UserAgent = "Mozilla/5.0 (Windows NT 10; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0";
        // Fixed: the media type was misspelled as "ext/html".
        httpRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        httpRequest.Headers.Add(HttpRequestHeader.AcceptLanguage, "es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3");
        httpRequest.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate;q=0.8");
        httpRequest.Headers.Add(HttpRequestHeader.CacheControl, "no-cache");

        using (HttpWebResponse httpResponse = (HttpWebResponse)await httpRequest.GetResponseAsync())
        {
            if (httpResponse.StatusCode == HttpStatusCode.OK)
            {
                // Fall back to UTF-8 when the server sends no charset or one
                // this runtime does not know, instead of discarding the page.
                Encoding encoding = Encoding.UTF8;
                string charset = httpResponse.CharacterSet;
                if (!string.IsNullOrEmpty(charset))
                {
                    try
                    {
                        encoding = Encoding.GetEncoding(charset);
                    }
                    catch (ArgumentException)
                    {
                        // Unknown charset name: keep the UTF-8 default.
                    }
                }

                try
                {
                    using (Stream responseStream = httpResponse.GetResponseStream())
                    using (StreamReader reader = new StreamReader(responseStream, encoding))
                    {
                        Payload = (await reader.ReadToEndAsync()).Trim();
                    }
                }
                catch (IOException)
                {
                    // The connection dropped mid-body: report "no content".
                    Payload = string.Empty;
                }
            }
        }
    }
    catch (WebException exW)
    {
        if (exW.Response != null)
        {
            //Handle WebException
        }
    }
    catch (Exception)
    {
        //Handle System.Exception
    }
    return Payload;
}
Related
I have a little problem with cookie handling in C#
On my website I have a login page; once logged in, I am redirected to the home page. I use HttpWebRequest to connect and follow the redirection, and I created a class for it. Here it is:
/// <summary>
/// Logs in to a site through a form POST and keeps the session cookies so that
/// later requests can reuse them.
/// </summary>
class webReq
{
    private string urlConnection;
    private string login;
    private string password;
    // Cookies collected during login; null until StartConnection succeeds.
    private CookieCollection cookieContainer;
    // Milliseconds taken by the most recent request.
    private long executionTime = 0;

    /// <param name="urlCo">URL of the login endpoint.</param>
    /// <param name="login">User name posted in the Username field.</param>
    /// <param name="pass">Password posted in the Password field.</param>
    public webReq(string urlCo, string login, string pass)
    {
        this.urlConnection = urlCo;
        this.login = login;
        this.password = pass;
        this.cookieContainer = null;
    }

    /// <summary>
    /// Posts the login form, saves the returned page to disk and stores every
    /// cookie received during the exchange, including cookies set on redirects.
    /// </summary>
    public void StartConnection()
    {
        string WriteHTML = "D:/REM/Connection.html";

        // Values are URL-encoded so that characters such as '&', '=', '+' or
        // spaces in the credentials cannot corrupt the form body.
        string postData = string.Join("&", new[]
        {
            FormField("Username", this.login),
            FormField("Password", this.password),
            FormField("func", "ll.login"),
            FormField("NextURL", "/admin/livelink.exe"),
            FormField("loginbutton", "Sign in"),
        });
        var buffer = Encoding.ASCII.GetBytes(postData);

        try
        {
            var watch = System.Diagnostics.Stopwatch.StartNew();
            CookieContainer jar = new CookieContainer();

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(this.urlConnection);
            request.AllowAutoRedirect = true;
            request.Method = "POST";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1003.1 Safari/535.19";
            request.Accept = "text/html, application/xhtml+xml, */*";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = buffer.Length;
            request.CookieContainer = jar;

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(buffer, 0, buffer.Length);
            }

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream))
            {
                watch.Stop();
                this.executionTime = watch.ElapsedMilliseconds;
                System.IO.File.WriteAllText(WriteHTML, reader.ReadToEnd());

                // response.Cookies only contains the cookies of the FINAL
                // response. When a redirect is followed automatically, cookies
                // set by the intermediate responses are stored in the request's
                // CookieContainer, so they are collected from there as well.
                CookieCollection collected = new CookieCollection();
                collected.Add(jar.GetCookies(request.RequestUri));
                collected.Add(jar.GetCookies(response.ResponseUri));
                collected.Add(response.Cookies);
                this.cookieContainer = collected;
            }
        }
        catch (WebException ex)
        {
            Console.WriteLine(ex.GetBaseException().ToString());
        }
    }

    // Builds one "name=value" pair of an application/x-www-form-urlencoded body.
    private static string FormField(string name, string value)
    {
        return name + "=" + Uri.EscapeDataString(value ?? string.Empty);
    }
}
I load the home page well, and I manage to get a cookie.
So I developed a function to use my cookie to browse the website :
/// <summary>
/// Requests <paramref name="url"/> with the cookies captured at login and
/// writes the returned page to "D:/REM/Page{numeroTest}.html".
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <param name="numeroTest">Number used to build the output file name.</param>
public void connectUrl(string url, int numeroTest)
{
    string WriteHTML = "D:/REM/Page" + numeroTest + ".html";
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    //Add cookie to request.CookieContainer
    request.CookieContainer = new CookieContainer();
    request.CookieContainer.Add(this.cookieContainer);

    var watch = System.Diagnostics.Stopwatch.StartNew();
    // Dispose response, stream and reader so the connection is returned to the
    // pool; otherwise repeated calls can exhaust the per-host connection limit.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream))
    {
        watch.Stop();
        this.executionTime = watch.ElapsedMilliseconds;
        System.IO.File.WriteAllText(WriteHTML, reader.ReadToEnd());
    }
}
Normally, I have to retrieve three cookies, like on the website :
However, I can't navigate the website: I end up on the login page because the cookies are not right. When debugging, I can see that only one cookie (BrowseSettings) was loaded; the other two (LLCookie and LLTZCookie) are missing:
I don't understand why I can't retrieve all the cookies on the website.... If anyone has a solution!
I found the reason why I couldn't get all the cookies, although I don't understand exactly why: it works once automatic redirection is disabled in my StartConnection() method. The setting involved is:
// Do not follow the redirect automatically, so the response that is read is
// the login response itself (the one carrying the Set-Cookie headers).
request.AllowAutoRedirect = false;
I am trying to get data from a Google Play web page with C# HttpWebRequest, but the response I get is different from what I expect.
Code:
// Base address of an app's store page; the package name is appended to it.
public const string googlePlayUrl = "https://play.google.com/store/apps/details?id=";

/// <summary>
/// Starts an asynchronous GET of the store page for
/// <paramref name="packageName"/> and writes the returned HTML to the console.
/// </summary>
/// <param name="packageName">Package identifier appended to <c>googlePlayUrl</c>.</param>
public void GetData(string packageName)
{
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(new Uri(googlePlayUrl + packageName));
    request.Method = WebRequestMethods.Http.Get;
    request.ContentType = "text/html";
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0";
    request.AutomaticDecompression = DecompressionMethods.GZip;
    request.BeginGetResponse((IAsyncResult asynchronousResult) =>
    {
        HttpWebRequest requested = (HttpWebRequest)asynchronousResult.AsyncState;
        try
        {
            // Disposing the reader and the response also closes the stream.
            using (HttpWebResponse response = (HttpWebResponse)requested.EndGetResponse(asynchronousResult))
            using (System.IO.Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream))
            {
                Console.WriteLine(reader.ReadToEnd());
            }
        }
        catch (WebException ex)
        {
            // This callback runs on a thread-pool thread: an unhandled
            // exception here (404, timeout, ...) would end the process.
            Console.Error.WriteLine(ex.Message);
        }
    }, request);
}
The connection works and I get a response, but it is different from what I see when I open the web page in a browser. It does not contain the elements I want to use, such as
div.id-app-title
span attr[itemprop="genre"]
div attr[itemprop="description"]
Not sure why, I've try to set its user-agent but it still not work or maybe I set it wrong.
Wish someone have solution for that :)
Assuming your public IP address has not been blocked by Google, you can use the synchronous method request.GetResponse() together with the Parallel.ForEach() as shown below:
/// <summary>
/// Synchronously downloads the store page for <paramref name="packageName"/>.
/// </summary>
/// <param name="packageName">Package identifier appended to <c>googlePlayUrl</c>.</param>
/// <returns>The HTML of the page.</returns>
public static string GetDataSync(string packageName)
{
    Uri uri = new Uri(googlePlayUrl + packageName);
    var request = HttpWebRequest.Create(uri);

    // The response owns the connection; it was never disposed before, which
    // matters when this method is called from many threads at once.
    using (var response = request.GetResponse())
    using (var responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream))
    {
        return reader.ReadToEnd();
    }
}
Call the method above using Parallel.ForEach and a thread-safe collection, ConcurrentDictionary, to store the HTML string result per package:
// Packages whose store pages are downloaded.
IEnumerable<string> appPackages = new List<string>
{
    "com.google.android.apps.youtube.music",
    "com.netflix.mediaclient"
};

// Thread-safe map from package name to downloaded HTML, sized up front for
// the number of packages.
int concurrencyLevel = Environment.ProcessorCount;
int initialCapacity = appPackages.Count();
ConcurrentDictionary<string, string> results =
    new ConcurrentDictionary<string, string>(concurrencyLevel, initialCapacity);

// Download the pages in parallel; each worker stores its own result.
Parallel.ForEach(appPackages, packageName =>
{
    string html = GetDataSync(packageName);
    results.TryAdd(packageName, html);
});
I'm trying to get the pronunciation for certain word from a web dictionary. For example, in the following code, I want to get the pronunciation of good from http://collinsdictionary.com
(HTML Agility Pack is used here)
// Downloads the dictionary entry for "good" and prints the text node that
// holds its pronunciation, or a notice when the XPath matches nothing.
static void test()
{
    String url = "http://www.collinsdictionary.com/dictionary/english/good";
    String html;
    // WebClient is IDisposable; release it as soon as the page is downloaded.
    using (WebClient client = new WebClient())
    {
        client.Encoding = System.Text.Encoding.UTF8;
        html = client.DownloadString(url);
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);
    // Attributes are selected with '@' in XPath; "#id" is not valid syntax.
    HtmlAgilityPack.HtmlNode node = doc.DocumentNode.SelectSingleNode("//*[@id=\"good_1\"]/div[1]/h2/span/text()[1]");
    if (node == null)
    {
        Console.WriteLine("XPath not found.");
    }
    else
    {
        Console.WriteLine(node.WriteTo());
    }
}
I was expecting
(ɡʊd
but what I could get at best is
(ɡ?d
How to get it right?
The problem is not in your parsing of the text, rather it is a problem with the console output. If you are doing this from a command line app, you can set the output encoding of the console to be unicode:
// Make the console emit Unicode so that characters such as "ʊ" are not
// replaced by "?" when written.
Console.OutputEncoding = System.Text.Encoding.Unicode;
You need to also ensure that your font in the console is a font that has unicode support. See this answer for more info.
If you know the page encoding (e.g. System.Text.Encoding.UTF8), pass it explicitly:
// Decode the response as UTF-8; 20000 is the request timeout in milliseconds.
string html = DownloadSmallFiles_String(url, System.Text.Encoding.UTF8, 20000);
or use automatic encoding detection (depends on server response)
// Passing null lets the function choose the encoding from the character set
// reported by the response.
string html = DownloadSmallFiles_String(url, null, 20000);
and finally load the html
// Hand the downloaded markup to the document for parsing
// (doc is presumably an HtmlAgilityPack.HtmlDocument, as in the sample below).
doc.LoadHtml(html);
Try below code
// Downloads the dictionary entry for "good" through DownloadSmallFiles_String
// and prints the text node that holds its pronunciation.
static void test()
{
    String url = "http://www.collinsdictionary.com/dictionary/english/good";
    // null means: detect the encoding from the server response.
    // Set it to e.g. System.Text.Encoding.UTF8 to force a specific encoding.
    System.Text.Encoding PageEncoding = null;
    string html = DownloadSmallFiles_String(url, PageEncoding, 20000);

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);
    // Attributes are selected with '@' in XPath; "#id" is not valid syntax.
    HtmlAgilityPack.HtmlNode node = doc.DocumentNode.SelectSingleNode("//*[@id=\"good_1\"]/div[1]/h2/span/text()[1]");
    if (node == null)
    {
        Console.WriteLine("XPath not found.");
    }
    else
    {
        Console.WriteLine(node.WriteTo());
    }
}
/// <summary>
/// Builds a GET request for <paramref name="url"/> that bypasses caches and
/// any proxy, does not keep the connection alive, and uses default credentials.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="TimeOut">Value assigned to the request's Timeout property.</param>
/// <returns>The configured request; nothing has been sent yet.</returns>
private static HttpWebRequest CreateWebRequest(string url, int TimeOut = 20000)
{
    var webRequest = (HttpWebRequest)WebRequest.Create(url);

    // What is sent.
    webRequest.Method = "GET";
    webRequest.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko";
    webRequest.UseDefaultCredentials = true;

    // How the connection behaves.
    webRequest.Timeout = TimeOut;
    webRequest.KeepAlive = false;
    webRequest.CachePolicy = new HttpRequestCachePolicy(HttpRequestCacheLevel.NoCacheNoStore);
    webRequest.Proxy = null;//ProxyHelperClass.GetIEProxy;

    return webRequest;
}
/// <summary>
/// Downloads <paramref name="Url"/> and returns the response body as text.
/// </summary>
/// <param name="Url">Address to download.</param>
/// <param name="ForceTextEncoding_SetThistoNothingToUseAutomatic">
/// Encoding used to decode the body. Pass <c>null</c> to use the character set
/// announced by the response, falling back to the StreamReader default when
/// none is announced or the announced name is unknown.
/// </param>
/// <param name="TimeOut">Timeout passed to <c>CreateWebRequest</c>.</param>
/// <returns>
/// The decoded body, or an empty string when the status is not 200 OK or any
/// error occurs.
/// </returns>
public static string DownloadSmallFiles_String(string Url, System.Text.Encoding ForceTextEncoding_SetThistoNothingToUseAutomatic, int TimeOut = 20000)
{
    try
    {
        HttpWebRequest request = CreateWebRequest(Url, TimeOut);
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return "";
            }

            System.Text.Encoding encoding = ForceTextEncoding_SetThistoNothingToUseAutomatic;
            if (encoding == null && !string.IsNullOrEmpty(response.CharacterSet))
            {
                try
                {
                    encoding = System.Text.Encoding.GetEncoding(response.CharacterSet);
                }
                catch (ArgumentException)
                {
                    // Charset name not known to this runtime: use the reader's
                    // default rather than discarding the whole page.
                    encoding = null;
                }
            }

            using (Stream receiveStream = response.GetResponseStream())
            using (StreamReader reader = encoding != null
                ? new StreamReader(receiveStream, encoding)
                : new StreamReader(receiveStream))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception)
    {
        // Best-effort contract: callers receive "" on any failure.
        return "";
    }
}
I am trying to log in to a forum with HttpWebRequest, but I have had no success so far. This is my code:
// Posts the vBulletin login form and shows the returned page source.
string url = "http://www.warriorforum.com/";
// '@' makes this a verbatim string literal ("#" is not valid C#).
var bytes = Encoding.Default.GetBytes(@"vb_login_username=MyUsername&cookieuser=1&vb_login_password=&s=&securitytoken=guest&do=login&vb_login_md5password=d9350bad28eee253951d7c5211e50179&vb_login_md5password_utf=d9350bad28eee253951d7c5211e50179");
var container = new CookieContainer();
var request = (HttpWebRequest)(WebRequest.Create(url));
request.CookieContainer = container;
request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/22.0.1229.94 Safari/535.2";
request.Method = "POST";
// Without this header the server has no way to know the body is form data.
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = bytes.Length;
request.KeepAlive = true;
request.AllowAutoRedirect = true;
request.AllowWriteStreamBuffering = true;
using (var requestStream = request.GetRequestStream())
{
    requestStream.Write(bytes, 0, bytes.Length);
}
// Dispose the response as well, so the connection is released.
using (var requestResponse = request.GetResponse())
using (var responsStream = requestResponse.GetResponseStream())
{
    if (responsStream != null)
    {
        using (var responseReader = new StreamReader(responsStream))
        {
            var responseStreamReader = responseReader.ReadToEnd();
            richTextBox1.Text = responseStreamReader; //this is to read the page source after the request
        }
    }
}
After the request the response is just the same page, nothing changed, no message telling me that i input wrong password or something like that.
I just tested using my generic VBulletin login function and it seemed to work fine:
/// <summary>
/// Posts vBulletin login fields to <paramref name="loginUrl"/> and reports
/// whether the returned page contains the login confirmation text.
/// </summary>
/// <param name="loginUrl">Address the login form is posted to.</param>
/// <param name="user">Value sent as vb_login_username.</param>
/// <param name="password">Value sent as vb_login_password.</param>
/// <returns><c>true</c> when the response contains "Thank you for logging in".</returns>
private static bool VBulletinLogin(Uri loginUrl, string user, string password)
{
    HttpParam[] formFields =
    {
        new HttpParam("vb_login_username", user),
        new HttpParam("cookieuser", "1"),
        new HttpParam("vb_login_password", password),
        new HttpParam("securitytoken", "guest"),
        new HttpParam("do", "login"),
    };

    var session = new HttpContext();
    var page = session.GetEncodedPageData(loginUrl, HttpRequestType.POST, formFields);
    var body = page.ResponseData;
    return body.Contains("Thank you for logging in");
}
Unfortunately, this uses my HttpContext class, which is part of a library I've been writing, and the features are fairly intertwined. Hopefully, however, it will at least give you an idea of the post params. I've also included a few helpful structs/functions from my own class which should help. (Note: this requires a reference to the .NET 3.5 System.Web namespace.)
First helpful struct, HttpParam:
/// <summary>
/// One name/value pair of a query string or form body. The raw strings are
/// stored; <see cref="Key"/> and <see cref="Value"/> return them URL-encoded.
/// </summary>
public struct HttpParam
{
    private string _key;
    private string _value;

    /// <summary>URL-encoded parameter name; the setter stores the raw text.</summary>
    // Fixed: the class name was misspelled "HttpUtilty", which does not compile.
    public string Key { get { return HttpUtility.UrlEncode(_key); } set { _key = value; } }

    /// <summary>URL-encoded parameter value; the setter stores the raw text.</summary>
    public string Value { get { return HttpUtility.UrlEncode(_value); } set { _value = value; } }

    /// <param name="key">Raw (unencoded) parameter name.</param>
    /// <param name="value">Raw (unencoded) parameter value.</param>
    public HttpParam(string key, string value)
    {
        _key = key;
        _value = value;
    }

    /// <summary>Returns the pair as "key=value" with both parts URL-encoded.</summary>
    public override string ToString()
    {
        return string.Format("{0}={1}", Key, Value);
    }
}
And a function to go along with it:
/// <summary>
/// Joins the parameters into a single "k1=v1&amp;k2=v2" string.
/// </summary>
/// <param name="args">Parameters to join; may be <c>null</c>.</param>
/// <returns>The joined string, or an empty string when <paramref name="args"/> is <c>null</c>.</returns>
private static string GetQueryString(HttpParam[] args)
{
    if (args == null)
    {
        return string.Empty;
    }

    string[] pairs = Array.ConvertAll(args, param => param.ToString());
    return string.Join("&", pairs);
}
The combination of these will help you to generate consistent, safe query strings. So in the above case:
// Fields of the login form, in the order they are sent.
HttpParam[] postParams =
{
    new HttpParam("vb_login_username", user),
    new HttpParam("cookieuser", "1"),
    new HttpParam("vb_login_password", password),
    new HttpParam("securitytoken", "guest"),
    new HttpParam("do", "login"),
};
// Encoded "name=value" pairs joined with '&'.
string queryString = GetQueryString(postParams);
Would get you something like:
vb_login_username=<user>&cookieuser=1&vb_login_password=<password>&securitytoken=guest&do=login
Then something similar to what you already have for posting could be used, just ensure you have the correct URL. I'd also use UTF8 encoding when getting the query string bytes. For example (using your original code, slightly modified)
// Login form fields; HttpParam URL-encodes names and values.
var postParams = new[] {
    new HttpParam("vb_login_username", "yourusername"),
    new HttpParam("cookieuser", "1"),
    new HttpParam("vb_login_password", "yourpassword"),
    new HttpParam("securitytoken", "guest"),
    new HttpParam("do", "login"),
};
string url = "http://warriorforum.com/login.php?do=login";
var container = new CookieContainer();
var buffer = Encoding.UTF8.GetBytes(GetQueryString(postParams));
var request = (HttpWebRequest)HttpWebRequest.Create(url);
request.CookieContainer = container;
request.UserAgent = "Mozilla/5.0";
request.Method = "POST";
request.KeepAlive = true;
request.AllowAutoRedirect = true;
request.ContentLength = buffer.Length;
request.ContentType = "application/x-www-form-urlencoded; charset=UTF-8";
request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
using (var requestStream = request.GetRequestStream())
{
    requestStream.Write(buffer, 0, buffer.Length);
}
// GetResponse() is declared to return WebResponse, which has no StatusCode;
// the cast to HttpWebResponse is required for the status check to compile.
using (var response = (HttpWebResponse)request.GetResponse())
{
    if (response.StatusCode == HttpStatusCode.OK || response.StatusCode == HttpStatusCode.NotModified)
    {
        using (var reader = new StreamReader(response.GetResponseStream()))
        {
            var result = reader.ReadToEnd();
            richTextBox1.Text = result; //this is to read the page source after the request
        }
    }
}
Note the changes with the ContentType as well.
You seem to be missing something the browser does when you login... does that forum really need a POST or perhaps a GET ? Are all your parameters correct ? Does the web page perhaps send an additional parameter (hidden) when login happens from the browser ?
You need to see what really goes over the wire when you login manually via a browser - use Wireshark or Fiddler to find out and then simulate what happens in code...
I am trying to scrape content from this page: https://www.google.com/search?hl=en&biw=1920&bih=956&tbm=shop&q=Xenon+12640&oq=Xenon+12640&aq=f&gs_l=serp.3...3743.3743.0.3905.1.1.0.0.0.0.0.0..0.0.ekh..0.0.Hq3XS7AxFDU&sei=Dr_MT_WOM6nO2AWE25mTCA&gbv=2
The problem I am experiencing is that opening that url in a browser I get everything I need to scrape but scraping the same link in the code, two (important) pieces are missing, the reviews number and the ratings, below the price and the seller info.
Here is the screenshot from the internal web client in c#: http://gyazo.com/908a37c7f70712fba1f82ec90a604d4d.png?1338822369
Here is the code with which I am trying to get the content:
/// <summary>
/// Performs a GET on <paramref name="inURL"/>, merges the returned cookies into
/// <paramref name="inCookieContainer"/> and returns the response body.
/// </summary>
/// <param name="inURL">Address to request.</param>
/// <param name="inCookieContainer">Cookie store attached to the request and updated from the response.</param>
/// <param name="GZip">When true, an "Accept-Encoding: gzip, deflate" header is sent.</param>
/// <param name="proxyAddress">Proxy host; "0" means no proxy.</param>
/// <param name="proxyPort">Proxy port; 0 means no proxy.</param>
/// <param name="proxyUserName">User name for the proxy credentials.</param>
/// <param name="proxyPassword">Password for the proxy credentials.</param>
/// <returns>
/// The response body. On any failure the exception message is returned
/// instead, so callers cannot tell an error from content by type alone.
/// </returns>
public string navGet(string inURL, CookieContainer inCookieContainer, bool GZip, string proxyAddress, int proxyPort,string proxyUserName, string proxyPassword)
{
    try
    {
        this.currentUrl = inURL;
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(inURL);
        webRequest.Timeout = this.TimeOutSetting;
        webRequest.CookieContainer = inCookieContainer;

        // "0" as address or 0 as port is the caller's way of saying "no proxy".
        if (proxyAddress != "0" && proxyPort != 0)
        {
            webRequest.Proxy = new WebProxy(proxyAddress, proxyPort);
            // Use login credentials to access proxy
            webRequest.Proxy.Credentials = new NetworkCredential(proxyUserName, proxyPassword);
        }

        Uri destination = webRequest.Address;
        webRequest.KeepAlive = true;
        webRequest.Method = "GET";
        webRequest.Accept = "*/*";
        webRequest.Headers.Add("Accept-Language", "en-us");
        if (GZip)
        {
            webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
        }
        webRequest.AllowAutoRedirect = true;
        webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; FunWebProducts; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
        webRequest.ContentType = "text/xml";

        // Send the session cookie explicitly when one exists. This used to rely
        // on catching the NullReferenceException thrown for a missing cookie.
        if (inCookieContainer != null)
        {
            Cookie sessionCookie = inCookieContainer.GetCookies(destination)["PHPSESSID"];
            if (sessionCookie != null)
            {
                webRequest.Headers.Add("Cookie", "USER_OK=1;PHPSESSID=" + sessionCookie.Value);
            }
        }

        // Dispose the response so the connection is released on every path.
        using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
        {
            if (!webRequest.HaveResponse)
            {
                // Same text the caller previously received via the catch below.
                return "No response received from host.";
            }

            // First handle cookies: refresh the ones already known, add new ones.
            foreach (Cookie retCookie in webResponse.Cookies)
            {
                bool cookieFound = false;
                foreach (Cookie oldCookie in inCookieContainer.GetCookies(destination))
                {
                    if (retCookie.Name.Equals(oldCookie.Name))
                    {
                        oldCookie.Value = retCookie.Value;
                        cookieFound = true;
                    }
                }
                if (!cookieFound)
                {
                    inCookieContainer.Add(retCookie);
                }
            }

            // Read response, inflating it when the server compressed it.
            Stream responseStream = webResponse.GetResponseStream();
            string contentEncoding = webResponse.ContentEncoding ?? string.Empty;
            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
            }
            else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                responseStream = new DeflateStream(responseStream, CompressionMode.Decompress);
            }

            string responseString;
            // NOTE(review): Encoding.Default ignores the charset announced by
            // the server; kept for compatibility with existing callers.
            using (StreamReader stream = new StreamReader(responseStream, System.Text.Encoding.Default))
            {
                responseString = stream.ReadToEnd();
            }

            this.currentUrl = webResponse.ResponseUri.ToString();
            this.currentAddress = webRequest.Address.ToString();
            setViewState(responseString);
            return responseString;
        }
    }
    catch (Exception ex)
    {
        //MessageBox.Show("NavGet:" + ex.Message);
        return ex.Message;
    }
}
It looks like this happens because the number of reviews and the ratings are generated dynamically using JavaScript (probably AJAX or something similar). In this case you need to analyze the additional traffic that takes place when the page is loaded in the browser and find where this data is transferred, or analyze the JavaScript code to see how it is generated.