I am trying to make a web proxy. Here is what I have so far:
IPHostEntry IPHost = Dns.GetHostEntry(sURL);
Console.WriteLine("Resolved:{0}", IPHost.HostName);
string[] aliases = IPHost.Aliases;
IPAddress[] address = IPHost.AddressList;
Console.WriteLine(address[0]);
IPEndPoint sEndpoint = new IPEndPoint(address[0], 80);
Socket IPsocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
IPsocket.Connect(sEndpoint);
if (IPsocket.Connected)
{
Console.WriteLine("Socket OK");
}
NetworkStream ns = new NetworkStream(IPsocket);
StreamWriter sw = new StreamWriter(ns);
StreamReader sr = new StreamReader(ns);
for (int i = 0; i < lista.Count; i++)
{
sw.WriteLine(lista[i]);
Console.WriteLine(lista[i]);
}
sw.Flush();
string response = sr.ReadToEnd();</pre>
And how I read the request:
StreamReader sr = new StreamReader(s);
string plusz = "";
plusz = sr.ReadLine();
while (plusz != "")
{
lista.Add(plusz);
plusz = sr.ReadLine();
}
return lista;
The request looks like this:
GET http://google.com/ HTTP/1.1
Host: google.com
Proxy-Connection: keep-alive
Cache-Control: max-age=0
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Encoding: gzip,deflate,sdch
Accept-Language: hu-HU,hu;q=0.8,en-US;q=0.6,en;q=0.4
Accept-Charset: ISO-8859-2,utf-8;q=0.7,*;q=0.3
Cookie: rememberme=true; NID=54=l
(...)
pY
And as you can see I sent this exactly. The problem is that the program stops at the sr.ReadToEnd() method. It is just waiting for the data to arrive, but nothing happens. If I send a wrong request, then it works, so the browser displays the wrong request page (400).
namespace ProxyTester
{
class Program
{
static void Main(string[] args)
{
var htmlResponse = new StringBuilder();
var RequestPage = BuildHttpRequest("https://google.com/");
GetHttpResponse(RequestPage, htmlResponse);
}
public static HttpWebRequest BuildHttpRequest(string url)
{
try
{
var getPage = (HttpWebRequest)WebRequest.Create(url);
WebProxy proxyHTTP = new WebProxy("201.38.194.50", 53128);
getPage.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
getPage.UserAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.19) Gecko/20110707 Firefox/3.6.19";
getPage.ProtocolVersion = HttpVersion.Version11;
getPage.Method = "GET";
getPage.Proxy = proxyHTTP;
getPage.Timeout = 10000;
getPage.ReadWriteTimeout = 10000;
return getPage;
}
catch (WebException ex)
{
Console.WriteLine(ex.ToString());
}
return null;
}
public static bool GetHttpResponse(HttpWebRequest page, StringBuilder htmlResponse)
{
htmlResponse.Length = 0;
try
{
Console.WriteLine("A");
// var pageResponse = page.GetResponse();
page.Timeout = 10000;
var pageResponse = (HttpWebResponse)page.GetResponse();
Console.WriteLine("5 minutes!");
if (pageResponse.StatusCode == HttpStatusCode.OK)
{
var reader = new StreamReader(pageResponse.GetResponseStream());
htmlResponse.Append(reader.ReadToEnd());
pageResponse.Close();
reader.Close();
return true;
}
Console.WriteLine(pageResponse.StatusCode.ToString());
pageResponse.Close();
return false;
}
catch (WebException ex)
{
Console.WriteLine(ex.ToString());
}
return false;
}
}
}
Related
I would like to get the HTML data from specific website ("https://www.justdial.com/Chennai/Silver-Oak-Service-Apartments-Next-to-Vivek-Showroom-Selaiyur/044PXX44-XX44-111215152228-W2G5_BZDET?xid=Q2hlbm5haSBIb3RlbHM=")
Interesting part is, I can able to get the data for other websites like (Google, StackOverFlow, etc)
I'm using below code, But I'm receiving "The operation has timed out" always.
public static string GetPageData(string url)
{
try
{
ServicePointManager.DefaultConnectionLimit = 7000;
ServicePointManager.Expect100Continue = true;
ServicePointManager.SecurityProtocol = SecurityProtocolType.Ssl3 | SecurityProtocolType.Tls | SecurityProtocolType.Tls11 | SecurityProtocolType.Tls12;
HttpWebRequest httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
httpWebRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36";
httpWebRequest.Headers.Add("Accept-Language", "en-US,fr-CA,it-IT;q=0.6");
httpWebRequest.ContentType = "text/html";
httpWebRequest.AllowAutoRedirect = true;
httpWebRequest.Method = "GET";
httpWebRequest.CookieContainer = new CookieContainer();
httpWebRequest.CookieContainer.Add(new Uri("https://justdial.com"), new CookieCollection());
httpWebRequest.KeepAlive = true;
httpWebRequest.MaximumAutomaticRedirections = 200;
httpWebRequest.Timeout = 7000;
using (var response = (HttpWebResponse)httpWebRequest.GetResponse())
{
Stream responseStream = response.GetResponseStream();
if (responseStream == null)
return string.Empty;
StreamReader streamReader = new StreamReader(responseStream);
string end = streamReader.ReadToEnd();
streamReader.Close();
streamReader.Dispose();
responseStream.Close();
responseStream.Dispose();
response.Close();
return end;
}
}
catch (Exception ex)
{
return ex.Message;
}
}
I am using a HttpWebRequest and a WebProxy to test some proxy servers to see if they are still functional or not. I set up the timeout to 1000 milliseconds but in some cases the program times out after more than 20 seconds. This happens in the catch loop. Is there a way to fix this? I don't want to wait more than 2 seconds for a proxy. I am using .Net 4.5.2
string GetPageThroughProxy(string sUrl, string sHost, string sProxyIPAddress, string sProxyPort, string sUsername, string sPassword)
{
Stopwatch stopwatch = Stopwatch.StartNew();
ServicePointManager.Expect100Continue = true;
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls | SecurityProtocolType.Tls11 | SecurityProtocolType.Tls12 | SecurityProtocolType.Ssl3;
HttpWebRequest httpWebRequest=(HttpWebRequest)WebRequest.Create(sUrl);
string proxyAddress = sProxyIPAddress;
string proxyPort = sProxyPort;
string username = sUsername;
string password = sPassword;
WebProxy wp = new WebProxy(proxyAddress + ":" + proxyPort);
wp.UseDefaultCredentials = false;
wp.Credentials = new NetworkCredential(username, password);
httpWebRequest.Proxy = wp;
string sReferer = sUrl;
httpWebRequest.ProtocolVersion = HttpVersion.Version11;
httpWebRequest.UserAgent = Variables.userAgent; // "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.40";
httpWebRequest.Accept = Variables.reqAccept; // "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9";
httpWebRequest.Headers["Accept-Encoding"] = "gzip, deflate, br";
httpWebRequest.AutomaticDecompression = DecompressionMethods.GZip;
httpWebRequest.Headers["Accept-Language"] = "en-US,en;q=0.9";
httpWebRequest.MaximumAutomaticRedirections = 2;
httpWebRequest.Method = "GET";
httpWebRequest.Referer = sReferer;
httpWebRequest.Timeout = 1000;
httpWebRequest.ReadWriteTimeout = 1000;
try {
WebResponse response = httpWebRequest.GetResponse();
string responseFromServer = string.Empty;
stopwatch.Stop();
MessageBox.Show(stopwatch.Elapsed.ToString("mm\\:ss\\.ff"));
using (Stream dataStream = response.GetResponseStream()) {
StreamReader reader = new StreamReader(dataStream);
responseFromServer = reader.ReadToEnd();
}
response.Close();
return responseFromServer;
}
catch(Exception ex) {
MessageBox.Show("Exception " + ex.Message + "\n\n" + stopwatch.Elapsed.ToString("mm\\:ss\\.ff"));
}
return string.Empty;
}
I want to try get current DateTime like this :
try {
HttpWebRequest req = (HttpWebRequest) WebRequest.Create("http://www.timestampconvert.com/");
req.Method = "GET";
req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
req.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36";
req.ContentType = "text/html; charset=utf-8";
req.Referer = string.Empty;
req.KeepAlive = true;
req.Timeout = 25000;
//req.Proxy = proxy;
HttpWebResponse res = (HttpWebResponse) req.GetResponse();
Stream Stream = res.GetResponseStream();
StreamReader reader = new StreamReader(Stream);
string reader_str = reader.ReadToEnd();
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.LoadHtml(reader_str);
var divs = from div in htmlDoc.DocumentNode.Descendants("div")
select div;
DateTime dt = DateTime.Now;
foreach(var div in divs) {
if (div.Attributes["class"] != null) {
if (div.Attributes["class"].Value == "remarkrow") {
if (div.InnerText.Contains("Computation based on input date")) {
CultureInfo cultureinfo = new CultureInfo("en-US");
dt = Convert.ToDateTime(div.InnerText.Replace("\n\t*)Computation based on input date ", string.Empty), cultureinfo);
dt = dt.AddHours(2).AddMinutes(30);
break;
}
}
}
}
DateTime dt_ = dt;
}
catch(Exception ex) {
MessageBox.Show(ex.ToString());
}
But it has an exception like below :
The request was aborted: Could not create SSL/TLS secure channel
How can i fix this error?
I am trying to make web scraper in C# for NSE. The code works with other sites but when ran on https://www.nseindia.com/ it gives error - An error occurred while sending the request. Unable to read data from the transport connection: Operation timed out.
I have tried with two different approaches Try1() & Try2().
Can anyone please tell what I am missing in my code?
class Program
{
public void Try1() {
HtmlWeb web = new HtmlWeb();
HttpStatusCode statusCode = HttpStatusCode.OK;
web.UserAgent = GetUserAgent();
web.PostResponse = (request, response) =>
{
if (response != null)
{
statusCode = response.StatusCode;
Console.WriteLine("Status Code: " + statusCode);
}
};
Task<HtmlDocument> task = web.LoadFromWebAsync(GetURL());
HtmlDocument document = task.Result;
}
public void Try2() {
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(GetURL());
request.UserAgent = GetUserAgent();
request.Accept= "*/*;";
using (var response = (HttpWebResponse)(request.GetResponse()))
{
HttpStatusCode code = response.StatusCode;
if (code == HttpStatusCode.OK)
{
using (StreamReader streamReader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
{
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
htmlDoc.Load(streamReader);
Console.WriteLine("Document Loaded.");
}
}
}
}
private string GetURL() {
// return "https://html-agility-pack.net/";
return "https://www.nseindia.com/";
}
private string GetUserAgent() {
return "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36";
}
}
Your are lack of headers towards Accept and others so it couldn't response back.
Besides that, I would recommend you using HttpClient instead of HttpWebRequest
public static async Task GetHtmlData(string url)
{
HttpClient httpClient = new HttpClient();
using (var request = new HttpRequestMessage(HttpMethod.Get, new Uri(url)))
{
request.Headers.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml, charset=UTF-8, text/javascript, */*; q=0.01");
request.Headers.TryAddWithoutValidation("Accept-Encoding", "gzip, deflate, br");
request.Headers.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36 OPR/67.0.3575.137");
request.Headers.TryAddWithoutValidation("Accept-Charset", "ISO-8859-1");
request.Headers.TryAddWithoutValidation("X-Requested-With", "XMLHttpRequest");
using (var response = await httpClient.SendAsync(request).ConfigureAwait(false))
{
response.EnsureSuccessStatusCode();
using (var responseStream = await response.Content.ReadAsStreamAsync().ConfigureAwait(false))
using (var decompressedStream = new GZipStream(responseStream, CompressionMode.Decompress))
using (var streamReader = new StreamReader(decompressedStream))
{
var result = await streamReader.ReadToEndAsync().ConfigureAwait(false);
HtmlDocument htmlDoc = new HtmlDocument();
htmlDoc.OptionFixNestedTags = true;
htmlDoc.LoadHtml(result);
Console.WriteLine(result);
Console.WriteLine("Document Loaded.");
}
}
}
Use it by
await GetHtmlData("https://www.nseindia.com/");
I have written some code to check to see if all websites in my database are still hosted and online.
The problem is some of these sites seem to have bot protection and whenever I try to request then via HttpClient they raise an error instead of displaying the page.
I have seen other similar questions that suggest to add in browser headers so I have done this but this does not help. The same sites still reject the HttpClient connection but are perfectly fine when I view them in the browser.
Have I done something wrong with my code or do I need some additional steps?
Here is my code:
public static async Task CheckSite(string url, int id)
{
try
{
using(var db = new PlaceDBContext())
using (HttpClient client = new HttpClient(new HttpClientHandler()
{
AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
}))
using (HttpResponseMessage response = await client.GetAsync(url))
using (HttpContent content = response.Content)
{
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml");
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Encoding", "gzip, deflate");
client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0");
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Charset", "ISO-8859-1");
var rd = db.RootDomains.Find(id);
string result = await content.ReadAsStringAsync();
if (result != null && result.Length >= 50)
{
Console.WriteLine("fine");
rd.LastCheckOnline = true;
}
else
{
Console.WriteLine("There was empty or short result");
rd.LastCheckOnline = false;
}
db.SaveChanges();
semaphore.Release();
}
}
catch(Exception ex)
{
Console.WriteLine(ex.Message);
using(var db = new PlaceDBContext())
{
var rd = db.RootDomains.Find(id);
rd.LastCheckOnline = false;
db.SaveChanges();
semaphore.Release();
}
}
}
Set the headers before sending the request. You are doing them after already getting a response
public static async Task CheckSite(string url, int id) {
try {
using (var db = new PlaceDBContext())
using (var client = new HttpClient(new HttpClientHandler() {
AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
})) {
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Encoding", "gzip, deflate");
client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0");
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Charset", "ISO-8859-1");
using (var response = await client.GetAsync(url))
using (var content = response.Content) {
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml");
var rd = db.RootDomains.Find(id);
string result = await content.ReadAsStringAsync();
if (result != null && result.Length >= 50) {
Console.WriteLine("fine");
rd.LastCheckOnline = true;
} else {
Console.WriteLine("There was empty or short result");
rd.LastCheckOnline = false;
}
db.SaveChanges();
semaphore.Release();
}
}
} catch (Exception ex) {
Console.WriteLine(ex.Message);
using (var db = new PlaceDBContext()) {
var rd = db.RootDomains.Find(id);
rd.LastCheckOnline = false;
db.SaveChanges();
semaphore.Release();
}
}
}