I'm having a problem with a basic piece of code:
var objReader = new StreamReader(ofDialog.FileName);
while (objReader.Peek() >= 0)
{
Helpers.returnMessage(objReader.ReadLine());
// trim the url to root
var uri = new Uri(objReader.ReadLine());
var host = uri.Host;
}
I'm loading a .txt file of urls, the trying to trim to root using uri.host but i'm getting the error:
Value cannot be null.Parameter name: UriString
But if i hard code a url in: var uri = new Uri("https://stackoverflow.com/questions/ask");
It works fine, it seems to be when i'm loading from a .txt file.
any help would be appreciated.
Complete code:
private void btnInput_Click(object sender, EventArgs e)
{
// ofDialog settings
ofDialog.Filter = #"TXT Files|*.txt";
ofDialog.Title = #"Select your source backlink file...";
ofDialog.FileName = "URLs.txt";
// is cancel pressed?
if (ofDialog.ShowDialog() == DialogResult.Cancel)
return;
try
{
var objReader = new StreamReader(ofDialog.FileName);
while (objReader.Peek() >= 0)
{
//Helpers.returnMessage(objReader.ReadLine());
// trim the url to root
var x = objReader.ReadLine();
Helpers.returnMessage(x);
// trim the url to root
var uri = new Uri(x);
var host = uri.Host;
//Helpers.returnMessage(host);
// extract urls here
var wc = new WebClient();
var html = wc.DownloadString(objReader.ReadLine());
// 1. Find all matches in file
var m1 = Regex.Matches(html, #"(<a.*?>.*?</a>)",
RegexOptions.Singleline);
// 2. Loop over each match
foreach (Match m in m1)
{
var value = m.Groups[1].Value;
string href;
// 3. Get href attribute
var m2 = Regex.Match(value, #"href=\""(.*?)\""",
RegexOptions.Singleline);
if (m2.Success)
{
href = m2.Groups[1].Value;
}
else
{
continue;
}
// add to the results
if (href.StartsWith("http"))
{
//Helpers.returnMessage(href);
if (!href.Contains(host))
{
// add urls to the listview
var lvi = new ListViewItem(href);
listViewMain.Items.Add(lvi);
}
}
}
}
}
catch (Exception ex)
{
Helpers.returnMessage(ex.Message);
}
}
returnMessage() literally just returns a message box popup.
Try this:
var objReader = new StreamReader(ofDialog.FileName);
while (objReader.Peek() >= 0)
{
string x = objReader.ReadLine();
Helpers.returnMessage(x);
// trim the url to root
var uri = new Uri(x);
var host = uri.Host;
}
Related
static void Main(string[] args) {
var file = File.Open(Directory.GetCurrentDirectory() + "/Net.pdf", FileMode.Open);
var pattern = new Regex("kullan", RegexOptions.IgnoreCase);
//this line is not working
TextExtractor textExtractor = new TextExtractor();
var dddd = ReadToEnd(file);
var textStrings = textExtractor.Extract(dddd);
var matches = pattern.Matches(textStrings.Text);
foreach (var item in matches)
{
Console.WriteLine(item);
}
}
You can try something like this:
File file = new File(myPDF);
//pattern
var pattern = new Regex("kullan", RegexOptions.IgnoreCase);
var textExtractor = new TextExtractor();
foreach (var page in file.Document.Pages)
{
var strings = textExtractor.Extract(page);
var matchingText = pattern.Matches(TextExtractor.ToString(strings));
}
Tika:
try
{
var result = new TextExtractor().Extract(yourPDF.pdf).Text;
Console.WriteLine(result.Text.Length);
foreach(var line in result)
{
if(line.contains("kullen"))
/** Do Something **/
}
}
catch(Exception e)
{
Console.WriteLine("Error occurred: " + e);
}
I did it like that, thank you your answer
namespace pro
{
class Program
{
static void Main(string[] args)
{
string b = pdfText(Directory.GetCurrentDirectory()+ "/Net.pdf");
string a= "kullan";
int sonuc;
sonuc = b.IndexOf(a,0, b.Length);
if(sonuc==-1)
{
Console.WriteLine("not found");
}
else
{
Console.WriteLine("found from " + sonuc.ToString() + ". character");
}
}
public static string pdfText(string path)
{
PdfReader reader = new PdfReader(path);
var dd = reader.GetPageContent(1);
string text = string.Empty;
for (int page = 1; page <= reader.NumberOfPages; page++)
{
System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
text += PdfTextExtractor.GetTextFromPage(reader, page);
}
reader.Close();
return text;
}
}
}
I Want to develope an app I give Url of a specific website to it,and it extract all links from that Web page. For each extracted link I want to get the HTML content. I am based in the concept of deep crawling.
My purpose is to get all email addresses of website. Below is my source code:
static string ExtractEmails(string data)
{
//instantiate with this pattern
Regex emailRegex = new Regex(#"\w+([-+.]\w+)*#\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);
//find items that matches with our pattern
MatchCollection emailMatches = emailRegex.Matches(data);
//StringBuilder sb = new StringBuilder();
string s = "";
foreach (Match emailMatch in emailMatches)
{
//sb.AppendLine(emailMatch.Value);
s += emailMatch.Value + ",";
}
return s;
}
static readonly List<ParsResult> _results = new List<ParsResult>();
static Int32 _maxDepth = 4;
static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
{
string email = "";
if (depth >= _maxDepth) return email;
String html;
using (var wc = new WebClient())
html = wc.DownloadString(urlToCheck ?? parent.Url);
var doc = new HtmlDocument();
doc.LoadHtml(html);
var aNods = doc.DocumentNode.SelectNodes("//a");
if (aNods == null || !aNods.Any()) return email;
foreach (var aNode in aNods)
{
var url = aNode.Attributes["href"];
if (url == null)
continue;
var wc2 = new WebClient();
String html2 = wc2.DownloadString(url.Value);
email = ExtractEmails(html2);
Console.WriteLine(email);
var result = new ParsResult
{
Depth = depth,
Parent = parent,
Url = url.Value
};
_results.Add(result);
Console.WriteLine("{0} - {1}", depth, result.Url);
Foo(depth: depth + 1, parent: result);
return email;
}
return email;
}
static void Main(string[] args)
{
String res = Foo("http://www.mobileridoda.com", 0);
Console.WriteLine("emails " + res);
}
I want to dispaly in console all emails extracted by all pages of all links that are inside DOM of Main page, But it dispalys no emails in console. How can I solve this issue ?
Thank you
Found a few things wrong but no worries, got the details on why and what to do to fix them.
In your foreach loop, when you go through the first URL, you are using a return statement at the end essentially breaking the loop and terminating. Use return only after you have processed ALL the URLs and accumulated the email addresses.
You are overwriting the email (i see it as a csv) when you go over the loop. Use += to continue adding. email = ExtractEmails(html2);
You are not returning anything when you call Foo within your forEach loop. You need to use email += Foo(xyz). Foo(depth: depth + 1, parent: result);
You are going through a URL that you have already processed... possibly causing an infinite cycle. I added a list of strings that keeps track of URLs you have already visited so as to prevent the infinite loop you might get into.
Here is a complete working solution.
static string ExtractEmails(string data)
{
//instantiate with this pattern
Regex emailRegex = new Regex(#"\w+([-+.]\w+)*#\w+([-.]\w+)*\.\w+([-.]\w+)*", RegexOptions.IgnoreCase);
//find items that matches with our pattern
MatchCollection emailMatches = emailRegex.Matches(data);
//StringBuilder sb = new StringBuilder();
string s = "";
foreach (Match emailMatch in emailMatches)
{
//sb.AppendLine(emailMatch.Value);
s += emailMatch.Value + ",";
}
return s;
}
static readonly List<ParsResult> _results = new List<ParsResult>();
static Int32 _maxDepth = 4;
static List<string> urlsAlreadyVisited = new List<string>();
static String Foo(String urlToCheck = null, Int32 depth = 0, ParsResult parent = null)
{
if (urlsAlreadyVisited.Contains(urlToCheck))
return string.Empty;
else
urlsAlreadyVisited.Add(urlToCheck);
string email = "";
if (depth >= _maxDepth) return email;
String html;
using (var wc = new WebClient())
html = wc.DownloadString(urlToCheck ?? parent.Url);
var doc = new HtmlDocument();
doc.LoadHtml(html);
var aNods = doc.DocumentNode.SelectNodes("//a");
if (aNods == null || !aNods.Any()) return email;
// Get Distinct URLs from all the URls on this page.
List<string> allUrls = aNods.ToList().Select(x => x.Attributes["href"].Value).Where(url => url.StartsWith("http")).Distinct().ToList();
foreach (string url in allUrls)
{
var wc2 = new WebClient();
try
{
email += ExtractEmails(wc2.DownloadString(url));
}
catch { /* Swallow Exception ... URL not found or other errors. */ continue; }
Console.WriteLine(email);
var result = new ParsResult
{
Depth = depth,
Parent = parent,
Url = url
};
_results.Add(result);
Console.WriteLine("{0} - {1}", depth, result.Url);
email += Foo(depth: depth + 1, parent: result);
}
return email;
}
public class ParsResult
{
public int Depth { get; set; }
public ParsResult Parent { get; set; }
public string Url { get; set; }
}
// ========== MAIN CLASS ==========
static void Main(string[] args)
{
String res = Foo("http://www.mobileridoda.com", 0);
Console.WriteLine("emails " + res);
}
I used svcUtil.exe to create classes from https://gw.sam.gov/SAMWS/1.0/Entity?wsdl
but I cannot for the life of me find what to deserialize the result into? When I created my own classes the root is envelope but it isn't even in the new classes.
I can paste the classes but it is really long? Is there a general answer to this?
I will paste the classes upon request.
Thanks in advance...
The classes are over 10x too long to post.
Adding code for the pull:
static void Main(string[] args)
{
ServicePointManager.Expect100Continue = true;
ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;
var _url = "https://gw.sam.gov/SAMWS/1.0/Entity";
//Run date is a specific date if provided otherwise use yesterday
DateTime startDateTime = DateTime.Now.AddDays(-1).Date;
for (int hr = 0; hr < 24; hr++)
{
XMLclassRequest xmlSoap = new XMLclassRequest();
string soap = xmlSoap.BuildSOAPrequest(startDateTime.AddHours(hr));
//string soap2 = xmlSoap.BuildSOAPrequest2(startDateTime.AddHours(hr));
string response = null; //This is the original pull with FAR and DFAR Responses
//string response2 = null; //This is FAR and DFAR
using (MyWebClient client = new MyWebClient())
{
client.Headers.Add("Content-Type", "text/xml;charset=utf-8");
client.Headers.Add("SOAPAction", "\"http://tempuri.org/IMyService/MyOperation\"");
try
{
response = client.UploadString(_url, soap);
//response2 = client.UploadString(_url, soap2);
}
catch
{ } //This will skip the hour attempted and move to next. The error I have been receiving is no data which is differently formatted XML that causes the error
}
//File.WriteAllText(#"D:\temp\bigpull.xml", response);
MemoryStream stream = null;
if (response != null)
{
byte[] byteArray = Encoding.Unicode.GetBytes(response);
stream = new MemoryStream(byteArray);
}
getEntities results;
XmlSerializer serializer = new XmlSerializer(typeof(getEntities));
try
{ results = (getEntities)serializer.Deserialize(stream); }
catch
{ } //This will skip the hour attempted and move to next. The error I have been receiving is no data which is differently formatted XML that causes the error
stream.Close();
string str;
}
The response is coming back. I just can't get it into an object to use.
I couldn't get this loaded into an object but ended up loading it to an XDocument and parsed that:
var _url = "https://gw.sam.gov/SAMWS/1.0/Entity";
//Run date is a specific date if provided otherwise use yesterday
DateTime startDateTime = DateTime.Now.AddDays(-1).Date;
for (int hr = 0; hr < 24; hr++)
{
XMLclassRequest xmlSoap = new XMLclassRequest();
string soap = xmlSoap.BuildSOAPrequest(startDateTime.AddHours(hr));
//string soap2 = xmlSoap.BuildSOAPrequest2(startDateTime.AddHours(hr));
string response = null; //This is the original pull with FAR and DFAR Responses
//string response2 = null; //This is FAR and DFAR
using (MyWebClient client = new MyWebClient())
{
client.Headers.Add("Content-Type", "text/xml;charset=utf-8");
client.Headers.Add("SOAPAction", "\"http://tempuri.org/IMyService/MyOperation\"");
try
{
response = client.UploadString(_url, soap);
//response2 = client.UploadString(_url, soap2);
}
catch
{ } //This will skip the hour attempted and move to next. The error I have been receiving is no data which is differently formatted XML that causes the error
}
//File.WriteAllText(#"D:\temp\far20190604.xml", response);
XDocument xdoc = XDocument.Parse(response);
var entities = from e in xdoc.Descendants("entity") select e;
foreach (var e in entities)
{
string DUNS = e.Descendants("DUNS").FirstOrDefault().Value;
var provisions = from p in e.Descendants("provision") select p;
foreach (var p in provisions)
{
string ParentID = p.Descendants("id").FirstOrDefault().Value;
var answers = from a in p.Descendants("answer") select a;
foreach (var a in answers)
{
var section = a.Descendants("section").Count()>0? a.Descendants("section").FirstOrDefault().Value : "";
var answerText = a.Descendants("answerText").Count() > 0 ? a.Descendants("answerText").FirstOrDefault().Value : "";
Console.WriteLine(DUNS + " " + ParentID + " " + section + " " + answerText);
}
}
}
How do I get Content-Disposition parameters I returned from WebAPI controller using WebClient?
WebApi Controller
[Route("api/mycontroller/GetFile/{fileId}")]
public HttpResponseMessage GetFile(int fileId)
{
try
{
var file = GetSomeFile(fileId)
HttpResponseMessage response = new HttpResponseMessage(HttpStatusCode.OK);
response.Content = new StreamContent(new MemoryStream(file));
response.Content.Headers.ContentDisposition = new System.Net.Http.Headers.ContentDispositionHeaderValue("attachment");
response.Content.Headers.ContentDisposition.FileName = file.FileOriginalName;
/********* Parameter *************/
response.Content.Headers.ContentDisposition.Parameters.Add(new NameValueHeaderValue("MyParameter", "MyValue"));
return response;
}
catch(Exception ex)
{
return Request.CreateErrorResponse(HttpStatusCode.InternalServerError, ex);
}
}
Client
void DownloadFile()
{
WebClient wc = new WebClient();
wc.DownloadDataCompleted += wc_DownloadDataCompleted;
wc.DownloadDataAsync(new Uri("api/mycontroller/GetFile/18"));
}
void wc_DownloadDataCompleted(object sender, DownloadDataCompletedEventArgs e)
{
WebClient wc=sender as WebClient;
// Try to extract the filename from the Content-Disposition header
if (!String.IsNullOrEmpty(wc.ResponseHeaders["Content-Disposition"]))
{
string fileName = wc.ResponseHeaders["Content-Disposition"].Substring(wc.ResponseHeaders["Content-Disposition"].IndexOf("filename=") + 10).Replace("\"", ""); //FileName ok
/****** How do I get "MyParameter"? **********/
}
var data = e.Result; //File OK
}
I'm returning a file from WebApi controller, I'm attaching the file name in the response content headers, but also I'd like to return an aditional value.
In the client I'm able to get the filename, but how do I get the aditional parameter?
If you are working with .NET 4.5 or later, consider using the System.Net.Mime.ContentDisposition class:
string cpString = wc.ResponseHeaders["Content-Disposition"];
ContentDisposition contentDisposition = new ContentDisposition(cpString);
string filename = contentDisposition.FileName;
StringDictionary parameters = contentDisposition.Parameters;
// You have got parameters now
Edit:
otherwise, you need to parse Content-Disposition header according to it's specification.
Here is a simple class that performs the parsing, close to the specification:
class ContentDisposition {
private static readonly Regex regex = new Regex(
"^([^;]+);(?:\\s*([^=]+)=((?<q>\"?)[^\"]*\\k<q>);?)*$",
RegexOptions.Compiled
);
private readonly string fileName;
private readonly StringDictionary parameters;
private readonly string type;
public ContentDisposition(string s) {
if (string.IsNullOrEmpty(s)) {
throw new ArgumentNullException("s");
}
Match match = regex.Match(s);
if (!match.Success) {
throw new FormatException("input is not a valid content-disposition string.");
}
var typeGroup = match.Groups[1];
var nameGroup = match.Groups[2];
var valueGroup = match.Groups[3];
int groupCount = match.Groups.Count;
int paramCount = nameGroup.Captures.Count;
this.type = typeGroup.Value;
this.parameters = new StringDictionary();
for (int i = 0; i < paramCount; i++ ) {
string name = nameGroup.Captures[i].Value;
string value = valueGroup.Captures[i].Value;
if (name.Equals("filename", StringComparison.InvariantCultureIgnoreCase)) {
this.fileName = value;
}
else {
this.parameters.Add(name, value);
}
}
}
public string FileName {
get {
return this.fileName;
}
}
public StringDictionary Parameters {
get {
return this.parameters;
}
}
public string Type {
get {
return this.type;
}
}
}
Then you can use it in this way:
static void Main() {
string text = "attachment; filename=\"fname.ext\"; param1=\"A\"; param2=\"A\";";
var cp = new ContentDisposition(text);
Console.WriteLine("FileName:" + cp.FileName);
foreach (DictionaryEntry param in cp.Parameters) {
Console.WriteLine("{0} = {1}", param.Key, param.Value);
}
}
// Output:
// FileName:"fname.ext"
// param1 = "A"
// param2 = "A"
The only thing that should be considered when using this class is it does not handle parameters (or filename) without a double quotation.
Edit 2:
It can now handle file names without quotations.
You can parse out the content disposition using the following framework code:
var content = "attachment; filename=myfile.csv";
var disposition = ContentDispositionHeaderValue.Parse(content);
Then just take the pieces off of the disposition instance.
disposition.FileName
disposition.DispositionType
With .NET Core 3.1 and more the most simple solution is:
using var response = await Client.SendAsync(request);
response.Content.Headers.ContentDisposition.FileName
The value is there I just needed to extract it:
The Content-Disposition header is returned like this:
Content-Disposition = attachment; filename="C:\team.jpg"; MyParameter=MyValue
So I just used some string manipulation to get the values:
void wc_DownloadDataCompleted(object sender, DownloadDataCompletedEventArgs e)
{
WebClient wc=sender as WebClient;
// Try to extract the filename from the Content-Disposition header
if (!String.IsNullOrEmpty(wc.ResponseHeaders["Content-Disposition"]))
{
string[] values = wc.ResponseHeaders["Content-Disposition"].Split(';');
string fileName = values.Single(v => v.Contains("filename"))
.Replace("filename=","")
.Replace("\"","");
/********** HERE IS THE PARAMETER ********/
string myParameter = values.Single(v => v.Contains("MyParameter"))
.Replace("MyParameter=", "")
.Replace("\"", "");
}
var data = e.Result; //File ok
}
As #Mehrzad Chehraz said you can use the new ContentDisposition class.
using System.Net.Mime;
// file1 is a HttpResponseMessage
FileName = new ContentDisposition(file1.Content.Headers.ContentDisposition.ToString()).FileName
Here is a sample of my code.
Here I recieve a string variable from another page.
protected override void OnNavigatedTo(System.Windows.Navigation.NavigationEventArgs e)
{
base.OnNavigatedTo(e);
string newparameter = this.NavigationContext.QueryString["search"];
weareusingxml();
displayResults(newparameter);
}
private void displayResults(string search)
{
bool flag = false;
try
{
using (IsolatedStorageFile myIsolatedStorage = IsolatedStorageFile.GetUserStoreForApplication())
{
using (IsolatedStorageFileStream stream = myIsolatedStorage.OpenFile("People.xml", FileMode.Open))
{
XmlSerializer serializer = new XmlSerializer(typeof(List<Person>));
List<Person> data = (List<Person>)serializer.Deserialize(stream);
List<Result> results = new List<Result>();
for (int i = 0; i < data.Count; i++)
{
string temp1 = data[i].name.ToUpper();
string temp2 = "*" + search.ToUpper() + "*";
if (temp1 == temp2)
{
results.Add(new Result() {name = data[i].name, gender = data[i].gender, pronouciation = data[i].pronouciation, definition = data[i].definition, audio = data[i].audio });
flag = true;
}
}
this.listBox.ItemsSource = results;
}
catch
{
textBlock1.Text = "error loading page";
}
if(!flag)
{
textBlock1.Text = "no matching results";
}
}
Nothing is loaded into the list when the code is run, I just get the message "no matching results".
Looks like you are trying to do a contains search (my guess based on your addition of the * around the search string. You can remove the '*' and do a string.Contains match.
Try this.
string temp1 = data[i].name.ToUpper();
string temp2 = search.ToUpper()
if (temp1.Contains(temp2))
{
It looks like you are trying to check if one string contains another (ie substring match) and not if they are equal.
In C#, you do this like this:
haystack = "Applejuice box";
needle = "juice";
if (haystack.Contains(needle))
{
// Match
}
Or, in your case (and skip the * you added to the string temp2)
if (temp1.Contains(temp2))
{
// add them to the list
}
Have you checked to make sure data.Count > 0?