I am trying to scrape content from this page: https://www.google.com/search?hl=en&biw=1920&bih=956&tbm=shop&q=Xenon+12640&oq=Xenon+12640&aq=f&gs_l=serp.3...3743.3743.0.3905.1.1.0.0.0.0.0.0..0.0.ekh..0.0.Hq3XS7AxFDU&sei=Dr_MT_WOM6nO2AWE25mTCA&gbv=2
The problem I am experiencing: opening that URL in a browser, I get everything I need to scrape, but when I fetch the same link from code, two (important) pieces are missing: the review count and the ratings, which appear below the price and the seller info.
Here is a screenshot from the internal web client in C#: http://gyazo.com/908a37c7f70712fba1f82ec90a604d4d.png?1338822369
Here is the code with which I am trying to get the content:
public string navGet(string inURL, CookieContainer inCookieContainer, bool GZip, string proxyAddress, int proxyPort,string proxyUserName, string proxyPassword)
{
try
{
this.currentUrl = inURL;
HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(inURL);
webRequest.Timeout = this.TimeOutSetting;
webRequest.CookieContainer = inCookieContainer;
if (proxyAddress != "0" && proxyPort != 0)
{
webRequest.Proxy = new WebProxy(proxyAddress, proxyPort);
// Use login credentials to access proxy
NetworkCredential networkCredential = new NetworkCredential(proxyUserName, proxyPassword);
webRequest.Proxy.Credentials = networkCredential;
}
Uri destination = webRequest.Address;
webRequest.KeepAlive = true;
webRequest.Method = "GET";
webRequest.Accept = "*/*";
webRequest.Headers.Add("Accept-Language", "en-us");
if (GZip)
{
webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
}
webRequest.AllowAutoRedirect = true;
webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; FunWebProducts; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
webRequest.ContentType = "text/xml";
//webRequest.CookieContainer.Add(inCookieContainer.GetCookies(destination));
try
{
string strSessionID = inCookieContainer.GetCookies(destination)["PHPSESSID"].Value;
webRequest.Headers.Add("Cookie", "USER_OK=1;PHPSESSID=" + strSessionID);
}
catch (Exception)
{
// A missing PHPSESSID cookie is not fatal; the request is simply sent without it.
}
HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse();
if (webRequest.HaveResponse)
{
// First handle cookies
foreach(Cookie retCookie in webResponse.Cookies)
{
bool cookieFound = false;
foreach(Cookie oldCookie in inCookieContainer.GetCookies(destination))
{
if (retCookie.Name.Equals(oldCookie.Name))
{
oldCookie.Value = retCookie.Value;
cookieFound = true;
}
}
if (!cookieFound)
inCookieContainer.Add(retCookie);
}
// Read response
Stream responseStream = webResponse.GetResponseStream();
if (webResponse.ContentEncoding.ToLower().Contains("gzip"))
{
responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
}
else if (webResponse.ContentEncoding.ToLower().Contains("deflate"))
{
responseStream = new DeflateStream(responseStream, CompressionMode.Decompress);
}
StreamReader stream = new StreamReader(responseStream, System.Text.Encoding.Default);
string responseString = stream.ReadToEnd();
stream.Close();
this.currentUrl = webResponse.ResponseUri.ToString();
this.currentAddress = webRequest.Address.ToString();
setViewState(responseString);
return responseString;
}
throw new Exception("No response received from host.");
}
catch(Exception ex)
{
//MessageBox.Show("NavGet:" + ex.Message);
return ex.Message;
}
}
It looks like this happens because the review count and the ratings are generated dynamically with JavaScript (probably AJAX or something similar). In this case you need to analyze the additional traffic that takes place when the page is loaded in the browser and find where this data is transferred, or analyze the JavaScript code to see how it is generated.
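If the data really is rendered client-side, one alternative to replaying the AJAX call yourself is to let a real browser engine execute the JavaScript and then read the finished DOM. A minimal sketch, assuming the Selenium WebDriver and ChromeDriver packages are available (this is not part of the original code, just one possible approach):
using System;
using System.Threading;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;

class RenderedScrapeSketch
{
    static void Main()
    {
        // Hypothetical URL; substitute the full search URL from the question.
        string url = "https://www.google.com/search?tbm=shop&q=Xenon+12640";
        using (IWebDriver driver = new ChromeDriver())
        {
            driver.Navigate().GoToUrl(url);
            // Give the page's scripts a moment to populate the ratings and review counts.
            Thread.Sleep(3000);
            // PageSource contains the DOM after JavaScript has run,
            // including content a plain HttpWebRequest never sees.
            string renderedHtml = driver.PageSource;
            Console.WriteLine(renderedHtml.Length);
        }
    }
}
The other route, sniffing the traffic with the browser's developer tools or Fiddler and calling the discovered endpoint directly, avoids the browser dependency but depends on the page's internal request format.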
Related
I have a little problem with cookie handling in C#
So on my web site, I have a login page; once logged in, I am redirected to the home page. I use HttpWebRequest to connect and follow the redirection. I created a class for this, here it is:
class webReq
{
private string urlConnection;
private string login;
private string password;
private CookieCollection cookieContainer;
private long executionTime = 0;
public webReq(string urlCo, string login, string pass)
{
this.urlConnection = urlCo;
this.login = login;
this.password = pass;
this.cookieContainer = null;
}
public void StartConnection()
{
string WriteHTML = "D:/REM/Connection.html";
List<string> datas = new List<string>();
datas.Add("Username=" + this.login);
datas.Add("Password=" + this.password);
datas.Add("func=ll.login");
datas.Add("NextURL=/admin/livelink.exe");
datas.Add("loginbutton=Sign in");
string postData = "";
postData = string.Join("&", datas);
var buffer = Encoding.ASCII.GetBytes(postData);
try
{
var watch = System.Diagnostics.Stopwatch.StartNew();
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(this.urlConnection);
request.AllowAutoRedirect = true;
request.Method = "POST";
request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1003.1 Safari/535.19";
request.Accept = "text/html, application/xhtml+xml, */*";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = buffer.Length;
request.CookieContainer = new CookieContainer();
Stream stream = request.GetRequestStream();
stream.Write(buffer, 0, buffer.Length);
stream.Close();
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
stream = response.GetResponseStream();
watch.Stop();
this.executionTime = watch.ElapsedMilliseconds;
StreamReader reader = new StreamReader(stream);
System.IO.File.WriteAllText(WriteHTML, reader.ReadToEnd());
this.cookieContainer = new CookieCollection();
foreach (Cookie cookie in response.Cookies)
{
this.cookieContainer.Add(cookie);
}
}
catch (WebException ex)
{
Console.WriteLine(ex.GetBaseException().ToString());
}
}
}
I load the home page fine, and I manage to get a cookie.
So I developed a function that uses my cookie to browse the website:
public void connectUrl(string url, int numeroTest)
{
string WriteHTML = "D:/REM/Page"+numeroTest+".html";
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
//Add cookie to request.CookieContainer
request.CookieContainer = new CookieContainer();
request.CookieContainer.Add(this.cookieContainer);
var watch = System.Diagnostics.Stopwatch.StartNew();
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream stream = response.GetResponseStream();
watch.Stop();
this.executionTime = watch.ElapsedMilliseconds;
StreamReader reader = new StreamReader(stream);
System.IO.File.WriteAllText(WriteHTML, reader.ReadToEnd());
}
Normally, I should retrieve three cookies, just like in the browser.
But I can't navigate the website: I end up back on the login page, so the cookies are not right. While debugging, I see that I only loaded one cookie (BrowseSettings) out of the three (LLCookie & LLTZCookie are missing).
I don't understand why I can't retrieve all the cookies from the website... If anyone has a solution!
I found the reason why I couldn't get all the cookies, even if I can't say exactly why it works: disabling redirection (AllowAutoRedirect = false) in my StartConnection() method, on this line:
request.AllowAutoRedirect = true;
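For reference, a minimal sketch of that idea (my reading of it, not the poster's exact code), placed inside StartConnection() after the request has been built as in the question: with auto-redirect off, the cookies set on the login response can be captured before the redirect is followed by hand.
request.AllowAutoRedirect = false; // capture cookies before the redirect is followed

using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
{
    // Cookies set by the login response (LLCookie, LLTZCookie, ...) are collected here;
    // with auto-redirect enabled, cookies from this intermediate response can be lost.
    this.cookieContainer = new CookieCollection();
    this.cookieContainer.Add(response.Cookies);

    if ((int)response.StatusCode >= 300 && (int)response.StatusCode < 400)
    {
        // Follow the redirect manually, re-sending the cookies just collected.
        Uri target = new Uri(new Uri(this.urlConnection), response.Headers["Location"]);
        HttpWebRequest next = (HttpWebRequest)WebRequest.Create(target);
        next.CookieContainer = new CookieContainer();
        next.CookieContainer.Add(target, this.cookieContainer);
        using (HttpWebResponse home = (HttpWebResponse)next.GetResponse())
        {
            // read the home page as before
        }
    }
}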
I am trying to perform a GET request to https://sede.educacion.gob.es/publiventa/catalogo.action?cod=E. With the cod=E parameter, the web site opens a menu below "Materias de educación" in the browser, but when I perform the request from C# this menu is not loaded, and I need it. This is the code I am using to read the HTML as a string, to parse it later with HtmlAgilityPack.
private string readHtml(string urlAddress)
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);
request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0";
request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
request.AutomaticDecompression = DecompressionMethods.GZip;
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
if (response.StatusCode == HttpStatusCode.OK)
{
Stream receiveStream = response.GetResponseStream();
StreamReader readStream = null;
if (response.CharacterSet == null)
{
readStream = new StreamReader(receiveStream);
}
else
{
readStream = new StreamReader(receiveStream, Encoding.GetEncoding(response.CharacterSet));
}
string data = readStream.ReadToEnd();
response.Close();
readStream.Close();
return data;
}
return null;
}
The Uri you posted (https://sede.educacion.gob.es/publiventa/catalogo.action?cod=E) uses a JavaScript switch to show the menu content.
When you connect to that Uri (without clicking a menu link), that site shows three different versions of that page.
1) Page with closed menu and proposed new editions
2) Page with closed menu and search engine fields
3) Page with open menu and a selection of the menu content
This switch is based on an internal procedure which records the current session. Unless you click on a menu link (which is connected to an event listener), the JavaScript procedure shows the page in these different states.
I gave it a look; those scripts are quite long (a whole multi-purpose library) and I had no time to parse it all (maybe you can) to find out what parameters the event listener is passing.
But the three-state version switch is constant.
What I mean is you can call that page three times, preserving the Cookie Container: the third time you connect to it, it will stream the whole menu content and its links.
If you request the same page three times, the third time the HTML page will contain all the Materias de educación links:
public async void SomeMethodAsync()
{
string HtmlPage = await GetHttpStream([URI]);
HtmlPage = await GetHttpStream([URI]);
HtmlPage = await GetHttpStream([URI]);
}
This is, more or less, what I used to get that page:
CookieContainer CookieJar = new CookieContainer();
public async Task<string> GetHttpStream(Uri HtmlPage)
{
HttpWebRequest httpRequest;
string Payload = string.Empty;
httpRequest = WebRequest.CreateHttp(HtmlPage);
try
{
httpRequest.CookieContainer = CookieJar;
httpRequest.KeepAlive = true;
httpRequest.ConnectionGroupName = Guid.NewGuid().ToString();
httpRequest.AllowAutoRedirect = true;
httpRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
httpRequest.ServicePoint.MaxIdleTime = 30000;
httpRequest.ServicePoint.Expect100Continue = false;
httpRequest.UserAgent = "Mozilla/5.0 (Windows NT 10; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0";
httpRequest.Accept = "ext/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
httpRequest.Headers.Add(HttpRequestHeader.AcceptLanguage, "es-ES,es;q=0.8,en-US;q=0.5,en;q=0.3");
httpRequest.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate;q=0.8");
httpRequest.Headers.Add(HttpRequestHeader.CacheControl, "no-cache");
using (HttpWebResponse httpResponse = (HttpWebResponse)await httpRequest.GetResponseAsync())
{
Stream ResponseStream = httpResponse.GetResponseStream();
if (httpResponse.StatusCode == HttpStatusCode.OK)
{
try
{
//ResponseStream.Position = 0;
Encoding encoding = Encoding.GetEncoding(httpResponse.CharacterSet);
using (MemoryStream _memStream = new MemoryStream())
{
if (httpResponse.ContentEncoding.Contains("gzip"))
{
using (GZipStream _gzipStream = new GZipStream(ResponseStream, System.IO.Compression.CompressionMode.Decompress))
{
_gzipStream.CopyTo(_memStream);
};
}
else if (httpResponse.ContentEncoding.Contains("deflate"))
{
using (DeflateStream _deflStream = new DeflateStream(ResponseStream, System.IO.Compression.CompressionMode.Decompress))
{
_deflStream.CopyTo(_memStream);
};
}
else
{
ResponseStream.CopyTo(_memStream);
}
_memStream.Position = 0;
using (StreamReader _reader = new StreamReader(_memStream, encoding))
{
Payload = _reader.ReadToEnd().Trim();
};
};
}
catch (Exception)
{
Payload = string.Empty;
}
}
}
}
catch (WebException exW)
{
if (exW.Response != null)
{
//Handle WebException
}
}
catch (System.Exception exS)
{
//Handle System.Exception
}
CookieJar = httpRequest.CookieContainer;
return Payload;
}
I'm trying to connect to the Lighthouse API from C# code. This PHP example https://support.lighthouserocks.com/hc/en-gb/articles/201319732-API-The-Basics describes how to do it, but I'm failing with it. I tried both NetworkCredential and sending the value in the header, but I still get 401 Unauthorized. Here is the code:
public string RequestResponse()
{
HttpWebRequest webRequest = WebRequest.Create(HomeUrl) as HttpWebRequest;
webRequest.Method = "GET";
webRequest.ContentType = "application/json";
webRequest.ServicePoint.Expect100Continue = false;
webRequest.Timeout = 20000;
string auth = CreateAuthorization("domain.lhlive.com", "user", "token");
webRequest.Headers["auth"] = "Basic " + auth;
//webRequest.Credentials = new NetworkCredential("user", "token");
//webRequest.PreAuthenticate = true;
//webRequest.Headers.Add("auth", "user, token");
webRequest.Accept = "application/vnd.lighthouse.v1.hal+json";
Stream responseStream = null;
StreamReader responseReader = null;
string responseData = "";
try
{
WebResponse webResponse = webRequest.GetResponse();
responseStream = webResponse.GetResponseStream();
responseReader = new StreamReader(responseStream);
responseData = responseReader.ReadToEnd();
}
finally
{
if (responseStream != null)
{
responseStream.Close();
responseReader.Close();
}
}
return responseData;
}
public void Test()
{
using (var client = new WebClient())
{
client.Headers["User-Agent"] = "Mozilla/4.0 (Compatible; Windows NT 5.1; MSIE 6.0) (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
client.Headers["ContentType"] = "application/json";
client.Headers["Accept"] = "application/vnd.lighthouse.v1.hal+json";
//client.Headers["Lighthouse Username"] = "user";
//client.Headers["API Key"] = "token";
client.Headers["WWW-Authenticate"] = "user, token";
byte[] arr = client.DownloadData("https://domain.lhlive.com/contacts");
Console.WriteLine("--- WebClient result ---");
Console.WriteLine(arr.Length);
}
}
Does anybody know what I should do?
Hard to say because I can't access Lighthouse, but try the following (notice how the Authorization header is set).
var webRequest = WebRequest.Create("http://some.endpoint.com/") as HttpWebRequest;
webRequest.Method = "GET";
webRequest.Accept = "application/vnd.lighthouse.v1.hal+json";
webRequest.ContentType = "application/json";
webRequest.Headers["Authorization"] = string.Format("Basic {0}",
Convert.ToBase64String(Encoding.Default.GetBytes(
string.Format("{0}:{1}", "your username", "your API key"))));
var response = webRequest.GetResponse();
var stream = response.GetResponseStream();
var data = (new StreamReader(stream)).ReadToEnd();
In this case your Authorization header looks like "Basic eW91ciB1c2VybmFtZTp5b3VyIEFQSSBrZXk=".
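The CreateAuthorization helper called in the question isn't shown; if it is meant to produce the same value as the inline Convert.ToBase64String call above, it could look roughly like this (a guess at the missing helper, not the original code):
// Hypothetical reconstruction of the helper used in the question:
// string auth = CreateAuthorization("domain.lhlive.com", "user", "token");
// The host parameter is unused here; only user:token goes into the Basic value.
private string CreateAuthorization(string host, string userName, string apiToken)
{
    string raw = string.Format("{0}:{1}", userName, apiToken);
    return Convert.ToBase64String(Encoding.UTF8.GetBytes(raw));
}
Whatever it returns still has to be sent in the standard Authorization header (webRequest.Headers["Authorization"] = "Basic " + auth;), not in a custom "auth" header as in the question.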
I have a problem posting data with HttpWebRequest.
There is a string (i.e. key1=value1&key2=value2&key3=value3) and I post it to a site (i.e. www.*.com/edit), but I don't know why sometimes nothing goes wrong, and sometimes the first key1=value1 is missing; only key2=value2&key3=value3 shows up in HttpAnalyzer.
public static string SubmitData(string Url, string FormData, CookieContainer _Cc, string ContentType)
{
Stream RequestStream = null, ResponseStream = null; StreamReader Sr = null;
HttpWebRequest HRequest = (HttpWebRequest)WebRequest.Create(Url);
try
{
HRequest.CookieContainer = _Cc;
HRequest.Method = "POST";
HRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)";
HRequest.ContentType = ContentType;
HRequest.ContentLength = FormData.Length;
//byte[] BFromData = new ASCIIEncoding().GetBytes(FormData);
byte[] BFromData = Encoding.ASCII.GetBytes(FormData);
BFromData = Encoding.Convert(Encoding.ASCII, Encoding.UTF8, BFromData);//ascii → utf8
RequestStream = HRequest.GetRequestStream();
RequestStream.Write(BFromData, 0, BFromData.Length);
//RequestStream.Write(utf8Bytes,0,utf8Bytes.Length );
HttpWebResponse HResponse = (HttpWebResponse)HRequest.GetResponse();
ResponseStream = HResponse.GetResponseStream();
Sr = new StreamReader(ResponseStream, Encoding.UTF8);
return Sr.ReadToEnd();
}
catch
{
return "";
}
finally
{
if (null != RequestStream) RequestStream.Close();
if (null != ResponseStream) ResponseStream.Close();
if (null != Sr) Sr.Close();
}
}
Use Fiddler to see what the request looks like when you submit the form, then try this approach and modify whatever you need for your request.
public static void PostDataAndDoSomething()
{
string URI = "http://www.something.com";
//make your request payload
string requestBody = String.Format("{{'param1': {0}, 'param2': {1}}}",value1, value2); //json format
HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(URI); //make request
// set request headers as you need
request.ContentType = "application/json; charset=UTF-8";
request.Accept = "application/json, text/javascript;
request.Method = "POST";
request.UserAgent = "";
request.Headers.Add("X-Requested-With", "XMLHttpRequest");
using (StreamWriter writer = new StreamWriter(request.GetRequestStream()))
{
writer.Write(requestBody); //write your request payload
}
WebResponse response = request.GetResponse();
string jsonData = String.Empty;
using (var reader = new StreamReader(response.GetResponseStream()))
{
jsonData = reader.ReadToEnd();
}
response.Close();
//do something with your data, deserialize, Regex etc....
}
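For a form post like the one in the question, the same pattern applies with a different content type; a minimal sketch, assuming the Url and FormData parameters of the original SubmitData method, that takes ContentLength from the encoded bytes rather than the string length:
// Sketch only: form-urlencoded variant of the POST above.
byte[] body = Encoding.UTF8.GetBytes(FormData); // e.g. "key1=value1&key2=value2&key3=value3"
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
request.Method = "POST";
request.ContentType = "application/x-www-form-urlencoded";
request.ContentLength = body.Length; // byte count, not string length
using (Stream requestStream = request.GetRequestStream())
{
    requestStream.Write(body, 0, body.Length);
}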
I have a problem at a certain company in Germany. They use a proxy in their network and my program can't communicate with the server.
IE works with these settings, which means: Automatically detect settings.
This is the code:
public static bool CompleteValidation(string regKey)
{
string uri = "***";
int c = 1;
if (Counter < 5) c = 6 - Counter;
string response = "";
try
{
System.Net.ServicePointManager.Expect100Continue = false;
HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(uri);
request.AllowWriteStreamBuffering = true;
request.Method = "POST";
request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0";
request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
request.Headers.Add(HttpRequestHeader.AcceptLanguage, "pl,en-us;q=0.7,en;q=0.3");
request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate");
request.Headers.Add(HttpRequestHeader.AcceptCharset, "ISO-8859-2,utf-8;q=0.7,*;q=0.7");
request.KeepAlive = true;
//proxy settings
string exepath = Path.GetDirectoryName(Application.ExecutablePath);
string proxySettings = exepath + @"\proxy.ini";
WebProxy wp = new WebProxy();
if (File.Exists(proxySettings)) {
request.Proxy = WebRequest.DefaultWebProxy;
IniFile ini = new IniFile(proxySettings);
string user = ini.IniReadValue("Proxy", "User");
string pass = ini.IniReadValue("Proxy", "Password");
string domain = ini.IniReadValue("Proxy", "Domain");
string ip = ini.IniReadValue("Proxy", "IP");
string port_s = ini.IniReadValue("Proxy", "Port");
int port = 0;
if (!string.IsNullOrEmpty(ip))
{
if (!string.IsNullOrEmpty(port_s))
{
try
{
port = Convert.ToInt32(port_s);
}
catch (Exception e)
{
ErrorLog.AddToLog("Problem with conversion of port:");
ErrorLog.AddToLog(e.Message);
ErrorLog.ShowLogWindow();
}
wp = new WebProxy(ip, port);
} else {
wp = new WebProxy(ip);
}
}
if (string.IsNullOrEmpty(domain))
wp.Credentials = new NetworkCredential(user, pass);
else
wp.Credentials = new NetworkCredential(user, pass, domain);
request.Proxy = wp;
}
string post = "***";
request.ContentLength = post.Length;
request.ContentType = "application/x-www-form-urlencoded";
StreamWriter writer = null;
try
{
writer = new StreamWriter(request.GetRequestStream()); // Here is the WebException thrown
writer.Write(post);
writer.Close();
}
catch (Exception e)
{
ErrorLog.AddToLog("Problem with request sending:");
ErrorLog.AddToLog(e.Message);
ErrorLog.ShowLogWindow();
}
HttpWebResponse Response = null;
try
{
Response = (HttpWebResponse)request.GetResponse();
}
catch (Exception e)
{
ErrorLog.AddToLog("Problem with response:");
ErrorLog.AddToLog(e.Message);
ErrorLog.ShowLogWindow();
}
//Request.Proxy = WebProxy.GetDefaultProxy();
//Request.Proxy.Credentials = CredentialCache.DefaultCredentials;
string sResponseHeader = Response.ContentEncoding; // get response header
if (!string.IsNullOrEmpty(sResponseHeader))
{
if (sResponseHeader.ToLower().Contains("gzip"))
{
byte[] b = DecompressGzip(Response.GetResponseStream());
response = System.Text.Encoding.GetEncoding(Response.ContentEncoding).GetString(b);
}
else if (sResponseHeader.ToLower().Contains("deflate"))
{
byte[] b = DecompressDeflate(Response.GetResponseStream());
response = System.Text.Encoding.GetEncoding(Response.ContentEncoding).GetString(b);
}
}
// uncompressed, standard response
else
{
StreamReader ResponseReader = new StreamReader(Response.GetResponseStream());
response = ResponseReader.ReadToEnd();
ResponseReader.Close();
}
}
catch (Exception e)
{
ErrorLog.AddToLog("Problem with comunication:");
ErrorLog.AddToLog(e.Message);
ErrorLog.ShowLogWindow();
}
if (response == "***")
{
SaveKeyFiles();
WriteRegKey(regKey);
RenewCounter();
return true;
}
else
{
return false;
}
}
My program logs it as:
[09:13:18] Searching for hardware ID
[09:13:56] Problem with response:
[09:13:56] The remote server returned an error: (407) Proxy Authentication Required.
[09:15:04] problem with comunication:
[09:15:04] Object reference not set to an object instance.
If they write the user and password into the proxy.ini file, the program works. But the problem is they can't do that, and somehow IE works without it. Is there any way to get those settings from IE or the system?
Use GetSystemWebProxy to return what the system default proxy is.
WebRequest.DefaultWebProxy = WebRequest.GetSystemWebProxy();
But every HttpWebRequest should automatically be filled out with this information by default. For example, the following snippet in a standalone console application should print the correct information on a system with a PAC file configured.
HttpWebRequest myWebRequest=(HttpWebRequest)WebRequest.Create("http://www.microsoft.com");
// Obtain the 'Proxy' of the Default browser.
IWebProxy proxy = myWebRequest.Proxy;
// Print the Proxy Url to the console.
if (proxy != null)
{
Console.WriteLine("Proxy: {0}", proxy.GetProxy(myWebRequest.RequestUri));
}
else
{
Console.WriteLine("Proxy is null; no proxy will be used");
}
Use DefaultNetworkCredentials to return system proxy credentials.
request.Proxy.Credentials = System.Net.CredentialCache.DefaultNetworkCredentials;
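Putting the two together, a minimal sketch (not from the original answer, and using a hypothetical URL) of a request that picks up the IE/system proxy and answers the 407 challenge with the logged-in user's credentials:
// Use the system-configured proxy ("Automatically detect settings" / PAC file)
// and the current Windows user's credentials for proxy authentication.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("https://example.com/"); // hypothetical URL
IWebProxy systemProxy = WebRequest.GetSystemWebProxy();
systemProxy.Credentials = System.Net.CredentialCache.DefaultNetworkCredentials;
request.Proxy = systemProxy;

using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader reader = new StreamReader(response.GetResponseStream()))
{
    string body = reader.ReadToEnd();
}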