I have a txt file that the format is:
0.32423 1.3453 3.23423
0.12332 3.1231 9.23432432
9.234324234 -1.23432 12.23432
...
Each line has three double values. There are more than 10000 lines in this file. I can use StreamReader.ReadLine and String.Split, then convert each value.
I want to know whether there is any faster method to do it.
Best Regards,
StreamReader.ReadLine, String.Split and Double.TryParse sounds like a good solution here.
No need for improvement.
There may be some little micro-optimisations you can perform, but the way you've suggested sounds about as simple as you'll get.
10000 lines shouldn't take very long - have you tried it and found you've actually got a performance problem? For example, here are two short programs - one creates a 10,000 line file and the other reads it:
CreateFile.cs:
using System;
using System.IO;
/// <summary>
/// Generates the sample input file "test.txt": 10,000 lines, each holding
/// three space-separated random doubles.
/// </summary>
public class Test
{
    static void Main()
    {
        var random = new Random();
        using (TextWriter output = File.CreateText("test.txt"))
        {
            int written = 0;
            while (written < 10000)
            {
                // Draw the three values in order so the line layout is first, second, third.
                double first = random.NextDouble();
                double second = random.NextDouble();
                double third = random.NextDouble();
                output.WriteLine("{0} {1} {2}", first, second, third);
                written++;
            }
        }
    }
}
ReadFile.cs:
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;
/// <summary>
/// Times how long it takes to read "test.txt" line by line, split each line
/// on spaces and parse every token as a double.
/// </summary>
public class Test
{
    static void Main()
    {
        Stopwatch timer = Stopwatch.StartNew();
        using (TextReader input = File.OpenText("test.txt"))
        {
            for (string current = input.ReadLine(); current != null; current = input.ReadLine())
            {
                foreach (string token in current.Split(' '))
                {
                    double parsed;
                    if (double.TryParse(token, out parsed))
                    {
                        continue;
                    }
                    // Report tokens that are not valid doubles, but keep going.
                    Console.WriteLine("Bad value");
                }
            }
        }
        timer.Stop();
        Console.WriteLine("Total time: {0}ms", timer.ElapsedMilliseconds);
    }
}
On my netbook (which admittedly has an SSD in) it only takes 82ms to read the file. I would suggest that's probably not a problem :)
I would suggest reading all your lines at once with
string[] lines = System.IO.File.ReadAllLines(fileName);
This would ensure that the I/O is done with the maximum efficiency. You would have to measure (profile), but I would expect the conversions to take far less time.
your method is already good!
you can improve it by writing a readline function that returns an array of double and you reuse this function in other programs.
This solution is a little bit slower (see benchmarks at the end), but it's nicer to read. It should also be more memory efficient because only the current character is buffered at a time (instead of the whole file or line).
Reading arrays is an additional feature in this reader which assumes that the size of the array always comes first as an int-value.
IParsable is another feature, that makes it easy to implement Parse methods for various types.
/// <summary>
/// Reads separator-delimited words from a <see cref="StreamReader"/> one
/// character at a time and converts them to ints, doubles or parsable objects.
/// Array reads expect the element count to come first, as an int word.
/// </summary>
class StringSteamReader {
    private StreamReader sr;
    private StringBuilder sb = new StringBuilder();
    bool eol;

    public StringSteamReader(StreamReader sr) {
        this.sr = sr;
        this.Separator = ' ';
    }

    /// <summary>Character that ends a word; set to a space by the constructor.</summary>
    public char Separator { get; set; }

    /// <summary>True when the underlying reader reports end of stream.</summary>
    public bool EOF {
        get { return sr.EndOfStream; }
    }

    /// <summary>True when the last word ended at a line break, or at end of stream.</summary>
    public bool EOL {
        get { return eol ? true : sr.EndOfStream; }
    }

    /// <summary>
    /// Reads characters up to the next separator, line break or end of stream
    /// and returns them as a string (possibly empty).
    /// </summary>
    public string ReadWord() {
        eol = false;
        sb.Clear();
        for (;;) {
            if (sr.EndOfStream) break;
            char current = (char)sr.Read();
            if (current == Separator) break;
            if (!IsNewLine(current)) {
                sb.Append(current);
                continue;
            }
            eol = true;
            // Swallow the rest of the line-break run ("\r\n", blank lines).
            while (IsNewLine((char)sr.Peek())) {
                sr.Read();
            }
            break;
        }
        return sb.ToString();
    }

    private bool IsNewLine(char c) {
        switch (c) {
            case '\r':
            case '\n':
                return true;
            default:
                return false;
        }
    }

    /// <summary>Reads the next word and parses it with <c>int.Parse</c>.</summary>
    public int ReadInt() {
        string word = ReadWord();
        return int.Parse(word);
    }

    /// <summary>Reads the next word and parses it with <c>double.Parse</c>.</summary>
    public double ReadDouble() {
        string word = ReadWord();
        return double.Parse(word);
    }

    /// <summary>Creates a <typeparamref name="T"/> and lets it parse itself from this reader.</summary>
    public T ReadObject<T>() where T : IParsable, new() {
        T instance = new T();
        instance.Parse(this);
        return instance;
    }

    /// <summary>Reads a count, then that many ints.</summary>
    public int[] ReadIntArray() {
        var items = new int[ReadInt()];
        int filled = 0;
        while (filled < items.Length) {
            items[filled++] = ReadInt();
        }
        return items;
    }

    /// <summary>Reads a count, then that many doubles.</summary>
    public double[] ReadDoubleArray() {
        var items = new double[ReadInt()];
        int filled = 0;
        while (filled < items.Length) {
            items[filled++] = ReadDouble();
        }
        return items;
    }

    /// <summary>Reads a count, then that many objects via <see cref="ReadObject{T}"/>.</summary>
    public T[] ReadObjectArray<T>() where T : IParsable, new() {
        var items = new T[ReadInt()];
        int filled = 0;
        while (filled < items.Length) {
            items[filled++] = ReadObject<T>();
        }
        return items;
    }

    /// <summary>Clears the end-of-line flag set by the last <see cref="ReadWord"/>.</summary>
    internal void NextLine() {
        eol = false;
    }
}

/// <summary>Implemented by types that can populate themselves from a <see cref="StringSteamReader"/>.</summary>
interface IParsable {
    void Parse(StringSteamReader r);
}
It can be used like this:
// Example IParsable.Parse implementation: each Read* call consumes the next
// word(s) from the reader, in the order the fields appear in the input.
public void Parse(StringSteamReader r) {
    var number = r.ReadDouble();
    var count = r.ReadInt();
    var word = r.ReadWord();
    var values = r.ReadDoubleArray();
    var child = r.ReadObject<MyParsableObject>();
    var children = r.ReadObjectArray<MyParsableObject>();
}
I did some benchmarking, comparing StringStreamReader with some other approaches, already proposed (StreamReader.ReadLine and File.ReadAllLines). Here are the methods I used for benchmarking:
// Benchmark: reads every double in the file through StringSteamReader and
// prints the elapsed time.
private static void Test_StringStreamReader(string filename) {
    Stopwatch timer = Stopwatch.StartNew();
    using (var text = new StreamReader(new FileStream(filename, FileMode.Open, FileAccess.Read))) {
        var words = new StringSteamReader(text);
        words.Separator = ' ';
        for (; !words.EOF; ) {
            var values = new List<double>();
            for (; !words.EOF; ) {
                double parsed = words.ReadDouble();
                values.Add(parsed);
            }
        }
    }
    timer.Stop();
    Console.WriteLine("elapsed: {0}", timer.Elapsed);
}
// Benchmark: StreamReader.ReadLine + String.Split + double.Parse; prints the
// elapsed time.
private static void Test_ReadLine(string filename) {
    Stopwatch timer = Stopwatch.StartNew();
    using (var input = new StreamReader(new FileStream(filename, FileMode.Open, FileAccess.Read))) {
        var values = new List<double>();
        while (!input.EndOfStream) {
            string[] tokens = input.ReadLine().Split(' ');
            for (int k = 0; k < tokens.Length; k++) {
                values.Add(double.Parse(tokens[k]));
            }
        }
    }
    timer.Stop();
    Console.WriteLine("elapsed: {0}", timer.Elapsed);
}
// Benchmark: File.ReadAllLines up front, then String.Split + double.Parse on
// each line; prints the elapsed time.
private static void Test_ReadAllLines(string filename) {
    Stopwatch timer = Stopwatch.StartNew();
    string[] allLines = System.IO.File.ReadAllLines(filename);
    var values = new List<double>();
    for (int row = 0; row < allLines.Length; row++) {
        string[] tokens = allLines[row].Split(' ');
        for (int col = 0; col < tokens.Length; col++) {
            values.Add(double.Parse(tokens[col]));
        }
    }
    timer.Stop();
    Console.WriteLine("Test_ReadAllLines: {0}", timer.Elapsed);
}
I used a file with 1.000.000 lines of double values (3 values each line). File is located on a SSD disk and each test was repeated multiple times in release-mode. These are the results (on average):
Test_StringStreamReader: 00:00:01.1980975
Test_ReadLine: 00:00:00.9117553
Test_ReadAllLines: 00:00:01.1362452
So, as mentioned StringStreamReader is a bit slower than the other approaches. For 10.000 lines, the performance is around (120ms / 95ms / 100ms).
Related
This question already has answers here:
Edit a specific Line of a Text File in C#
(6 answers)
Closed 5 years ago.
I have to write an implementation of an array that stores its values on the hard drive instead of in RAM (I know how stupid it sounds, but it's intended to teach us how different sorting algorithms work in RAM and on the hard drive). This is what I've written so far:
// Question code: an int "array" backed by a text file, one value per line.
// Every operation re-opens the file at filePath.
class HDDArray : IEnumerable<int>
{
// Path of the backing text file.
private string filePath;
// Indexer: element `index` lives on line `index` (0-based) of the file.
public int this[int index]
{
get
{
// Reads index + 1 lines; the last line read is converted to the element value.
using (var reader = new StreamReader(filePath))
{
string line = reader.ReadLine();
for (int i = 0; i < index; i++)
{
line = reader.ReadLine();
}
return Convert.ToInt32(line);
}
}
set
{
using (var fs = File.Open(filePath, FileMode.OpenOrCreate, FileAccess.ReadWrite))
{
var reader = new StreamReader(fs);
var writer = new StreamWriter(fs);
// Skips `index` lines through the reader, then writes the value as a line
// through a separate writer on the same stream.
// NOTE(review): nothing here removes or overwrites the old line's text, and
// the reader presumably buffers past the line it returned, so the write does
// not land at the start of line `index` — this is the behaviour the question
// asks about.
for (int i = 0; i < index; i++)
{
reader.ReadLine();
}
writer.WriteLine(value);
writer.Dispose();
}
}
}
// Number of lines in the file, counted by reading it from start to end.
public int Length
{
get
{
int length = 0;
using (var reader = new StreamReader(filePath))
{
while (reader.ReadLine() != null)
{
length++;
}
}
return length;
}
}
// Remembers the path and makes sure the file exists and is empty.
public HDDArray(string file)
{
filePath = file;
if (File.Exists(file))
File.WriteAllText(file, String.Empty);
else
File.Create(file).Dispose();
}
// Streams the file line by line, converting each line to an int.
public IEnumerator<int> GetEnumerator()
{
using (var reader = new StreamReader(filePath))
{
string line;
while ((line = reader.ReadLine()) != null)
{
yield return Convert.ToInt32(line);
}
}
}
// Non-generic enumerator forwards to the generic one.
IEnumerator IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
}
The problem I'm facing is that when I try to edit a line (in the set portion of the indexer) I end up adding a new line instead of editing the old one (it's pretty obvious why, I just can't figure out how to fix it).
Your array is designed to work with integers. Such a class is quite easy to create because the length of all numbers is 4 bytes.
/// <summary>
/// A growable array of ints stored in a binary file: element <c>i</c> occupies
/// the four bytes starting at offset <c>i * 4</c>. The file is created (or
/// truncated) on construction and kept open until <see cref="Dispose"/>.
/// </summary>
class HDDArray : IEnumerable<int>, IDisposable
{
    // Size of one stored element in bytes.
    private const int ElementSize = sizeof(int);

    readonly FileStream stream;
    readonly BinaryWriter writer;
    readonly BinaryReader reader;

    /// <summary>Creates or truncates <paramref name="file"/> and opens it for reading and writing.</summary>
    public HDDArray(string file)
    {
        stream = new FileStream(file, FileMode.Create, FileAccess.ReadWrite);
        writer = new BinaryWriter(stream);
        reader = new BinaryReader(stream);
    }

    /// <summary>
    /// Gets or sets the element at <paramref name="index"/>. Reading past the
    /// end throws <see cref="EndOfStreamException"/>; writing past the end
    /// grows the file.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The index is negative.</exception>
    public int this[int index]
    {
        get
        {
            stream.Position = OffsetOf(index);
            return reader.ReadInt32();
        }
        set
        {
            stream.Position = OffsetOf(index);
            writer.Write(value);
        }
    }

    /// <summary>Number of complete elements currently stored in the file.</summary>
    public int Length
    {
        get
        {
            // Divide in 64-bit first so files larger than 2 GB do not overflow the cast.
            return (int)(stream.Length / ElementSize);
        }
    }

    /// <summary>Enumerates the elements from first to last.</summary>
    public IEnumerator<int> GetEnumerator()
    {
        // Track our own offset and re-seek before every read: the consumer may
        // use the indexer between iterations, which moves the shared stream
        // position. Comparing against the stream length also avoids PeekChar,
        // which decodes bytes as text and is not a reliable end test for binary data.
        long offset = 0;
        while (offset + ElementSize <= stream.Length)
        {
            stream.Position = offset;
            yield return reader.ReadInt32();
            offset += ElementSize;
        }
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    /// <summary>Flushes pending writes and closes the file.</summary>
    public void Dispose()
    {
        // Writer first, so nothing tries to flush into an already closed stream.
        writer?.Dispose();
        reader?.Dispose();
        stream?.Dispose();
    }

    // Byte offset of element `index`, computed in 64-bit to avoid int overflow.
    private static long OffsetOf(int index)
    {
        if (index < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(index), "Index must be non-negative.");
        }
        return (long)index * ElementSize;
    }
}
Since the size of each array element is known, we can simply move to stream by changing its Position property.
BinaryWriter and BinaryReader are very comfortable to write and read numbers.
Opening a stream is a very heavy operation, so do it once when you create the class. At the end of the work, the class needs to clean up after itself, so I implemented the IDisposable interface.
Usage:
// Demo: create the array, fill ten slots, print them, then release the file.
var arr = new HDDArray("test.dat");
Console.WriteLine("Length: " + arr.Length);
int slot = 0;
while (slot < 10)
{
    arr[slot] = slot;
    slot++;
}
Console.WriteLine("Length: " + arr.Length);
foreach (int n in arr)
{
    Console.WriteLine(n);
}
// Console.WriteLine(arr[20]); // Exception!
arr.Dispose(); // release resources
I stand to be corrected, but I don't think there is an easy way to rewrite a specific line, so you will probably find it easier to rewrite the whole file with that line modified.
You could change your set code as follows:
set
{
    // Rewrite the whole file with line `index` replaced by the new value.
    // The field is named filePath, and the lines are strings, so the int
    // value has to be converted before it is stored.
    var allLinesInFile = File.ReadAllLines(filePath);
    allLinesInFile[index] = value.ToString();
    File.WriteAllLines(filePath, allLinesInFile);
}
Goes without saying that there should be some safety checks in there to check the file exists and index < allLinesInFile.Length
I think for the sake of homework of sorting algorithms you needn't bother yourself memory size issues.
Of course please add checking file existing to read.
Note: Line counting in example starts from 0.
// Load every line, then write the file back with line lineToEditNmb
// (0-based) replaced by lineToWrite.
string[] lines = File.ReadAllLines(filePath);
using (StreamWriter writer = new StreamWriter(filePath))
{
    int currentLineNmb = 0;
    foreach (string original in lines)
    {
        if (currentLineNmb != lineToEditNmb)
        {
            writer.WriteLine(original);
        }
        else
        {
            writer.WriteLine(lineToWrite);
        }
        currentLineNmb++;
    }
}
I am writing a class that contains a method that should return an array after reading from a file, but it always returns an empty array. When I run the same code directly in the main file, without the class, everything works fine.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace ProjectX
{
    // Question code, kept as posted apart from restoring the verbatim-string
    // prefix (@) that was garbled to '#'. The behaviour the question asks
    // about (ReadFromFile returning an empty array) is intentionally untouched.
    class ReadFile
    {
        // Reader opened once in the constructor and shared by both methods.
        private System.IO.StreamReader file;

        public ReadFile()
        {
            file = new System.IO.StreamReader(@"E:\\XXXX.csv");
        }

        // Counts lines by reading the shared reader through to the end.
        private int DataLenght()
        {
            String line = String.Empty;
            int lenght = 0;
            line = string.Empty;
            while ((line = file.ReadLine()) != null)
            {
                lenght++;
            }
            return lenght;
        }

        // Intended to parse every comma-separated value in the file.
        public double[] ReadFromFile()
        {
            double[] arr;
            String line = String.Empty;
            // Both DataLenght() calls read from the same reader as the loop below.
            double[] data = new double[DataLenght()];
            string[] dataString = new string[DataLenght()];
            List<double> listaDouble = new List<double>();
            int x = 0;
            String[] parts_of_line;
            while ((line = file.ReadLine()) != null)
            {
                parts_of_line = line.Split(',');
                for (int i = 0; i < parts_of_line.Length; i++)
                {
                    parts_of_line[i] = parts_of_line[i].Trim();
                    // NOTE(review): parses dataString[i], which is never filled in,
                    // rather than the trimmed parts_of_line[i].
                    data[i] = double.Parse(dataString[i]);
                    listaDouble.Add(data[i]);
                }
            }
            arr = listaDouble.ToArray();
            return arr;
        }
    }
}
and then in main I want to:
// ReadFromFile is a method, so it has to be invoked with parentheses.
ReadFile read = new ReadFile();
double[] arr = read.ReadFromFile();
Sorry for few not needed conversion in ReadFromFile method but try few things to make it works.
You have a number of problems. As your question is at the time of this answer your problem is that you have read the stream and it is now at the end as Andrei Rudik points out. You would have to set it back to the beginning before you try to read it each time:
file.BaseStream.Position = 0;
However, you have more issues after that. Your entire class can be refactored to this, read it carefully and understand what is happening:
/// <summary>Reads a comma-separated file of numbers into a flat array.</summary>
class ReadFile
{
    /// <summary>
    /// Parses every comma-separated token of every line as a double, in file order.
    /// </summary>
    /// <param name="path">File to read; defaults to the path used in the question.</param>
    /// <returns>All values of the file, line after line.</returns>
    /// <exception cref="System.FormatException">A token is not a valid double.</exception>
    public double[] ReadFromFile(string path = @"E:\XXXX.csv")
    {
        List<double> listaDouble = new List<double>();
        // Open the reader per call so each call starts at the top of the file.
        using (var file = new System.IO.StreamReader(path))
        {
            string line;
            while ((line = file.ReadLine()) != null)
            {
                string[] linetokens = line.Split(',');
                listaDouble.AddRange(linetokens.Select(l => double.Parse(l)));
            }
        }
        return listaDouble.ToArray();
    }
}
Note, this assumes your line tokens always parse to double which is an error prone assumption.
I am creating a word list of possible letter combinations to prove how insecure 8-character passwords are. This code will write aaaaaaaa, then aaaaaaab, then aaaaaaac, etc., until zzzzzzzz:
// Question code, kept as posted apart from restoring the verbatim-string
// prefix (@) that was garbled to '#'. It re-opens the output file for every
// single line, which is the slowness the question asks about.
class Program
{
    static string path;   // current output file
    static int file = 0;  // index used for the next file name

    static void Main(string[] args)
    {
        new_file();
        var alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789+-*_!$£^=<>§°ÖÄÜöäü.;:,?{}[]";
        var q = alphabet.Select(x => x.ToString());
        int size = 3;
        int counter = 0;
        // Each pass appends one more character, yielding all strings of length `size`.
        for (int i = 0; i < size - 1; i++)
        {
            q = q.SelectMany(x => alphabet, (x, y) => x + y);
        }
        foreach (var item in q)
        {
            if (counter >= 20000000)
            {
                new_file();
                counter = 0;
            }
            if (File.Exists(path))
            {
                using (StreamWriter sw = File.AppendText(path))
                {
                    sw.WriteLine(item);
                    Console.WriteLine(item);
                    /*if (!(Regex.IsMatch(item, @"(.)\1")))
                    {
                    sw.WriteLine(item);
                    counter++;
                    }
                    else
                    {
                    Console.WriteLine(item);
                    }*/
                }
            }
            else
            {
                new_file();
            }
        }
    }

    // Points `path` at the next numbered list file, creating it when missing.
    static void new_file()
    {
        path = @"C:\" + "list" + file + ".txt";
        if (!File.Exists(path))
        {
            using (StreamWriter sw = File.CreateText(path))
            {
            }
        }
        file++;
    }
}
The Code is working fine but it takes Weeks to run it. Does anyone know a way to speed it up or do I have to wait? If anyone has a idea please tell me.
Performance:
size 3: 0.02s
size 4: 1.61s
size 5: 144.76s
Hints:
removed LINQ for combination generation
removed Console.WriteLine for each password
removed StreamWriter
large buffer (128k) for file writing
// Generates every combination of `size` alphabet characters and writes them,
// one per line, into numbered files of 2,000,000 entries each. The password
// is kept as a byte buffer and advanced like an odometer.
// (Only code change: the verbatim-string prefix '@' is restored where it had
// been garbled to '#'.)
const string alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789+-*_!$£^=<>§°ÖÄÜöäü.;:,?{}[]";
var byteAlphabet = alphabet.Select(ch => (byte)ch).ToArray();
var alphabetLength = alphabet.Length;
var newLine = new[] { (byte)'\r', (byte)'\n' };
const int size = 4;
// number[j] is the alphabet index of the character currently at position j.
var number = new byte[size];
// Output buffer: `size` characters followed by "\r\n", written as-is.
var password = Enumerable.Range(0, size).Select(i => byteAlphabet[0]).Concat(newLine).ToArray();
var watcher = new System.Diagnostics.Stopwatch();
watcher.Start();
var isRunning = true;
for (var counter = 0; isRunning; counter++)
{
    // Progress: file index and the first password of that file.
    Console.Write("{0}: ", counter);
    Console.Write(password.Select(b => (char)b).ToArray());
    using (var file = System.IO.File.Create(string.Format(@"list.{0:D5}.txt", counter), 2 << 16))
    {
        for (var i = 0; i < 2000000; ++i)
        {
            file.Write(password, 0, password.Length);
            // Increment the right-most position; on wrap-around reset it and carry left.
            var j = size - 1;
            for (; j >= 0; j--)
            {
                if (number[j] < alphabetLength - 1)
                {
                    password[j] = byteAlphabet[++number[j]];
                    break;
                }
                else
                {
                    number[j] = 0;
                    password[j] = byteAlphabet[0];
                }
            }
            // The carry ran off the left edge: every combination has been written.
            if (j < 0)
            {
                isRunning = false;
                break;
            }
        }
    }
}
watcher.Stop();
Console.WriteLine(watcher.Elapsed);
}
Try the following modified code. In LINQPad it runs in < 1 second. With your original code I gave up after 40 seconds. It removes the overhead of opening and closing the file for every WriteLine operation. You'll need to test and ensure it gives the same results because I'm not willing to run your original code for 24 hours to ensure the output is the same.
/// <summary>
/// Writes every combination of <c>size</c> alphabet characters to numbered
/// list files, keeping one writer open instead of re-opening the file per line.
/// </summary>
class Program
{
    static string path;   // current output file
    static int file = 0;  // index used for the next file name

    static void Main(string[] args)
    {
        new_file();
        var alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ123456789+-*_!$£^=<>§°ÖÄÜöäü.;:,?{}[]";
        var q = alphabet.Select(x => x.ToString());
        int size = 3;
        int counter = 0;
        // Each pass appends one more character, yielding all strings of length `size`.
        for (int i = 0; i < size - 1; i++)
        {
            q = q.SelectMany(x => alphabet, (x, y) => x + y);
        }
        StreamWriter sw = File.AppendText(path);
        try
        {
            foreach (var item in q)
            {
                if (counter >= 20000000)
                {
                    // Rotate: close the full file and open a writer on the next
                    // one. Without re-opening, the following WriteLine would hit
                    // the disposed writer.
                    sw.Dispose();
                    new_file();
                    sw = File.AppendText(path);
                    counter = 0;
                }
                sw.WriteLine(item);
                counter++; // count written lines so the rotation threshold can be reached
                Console.WriteLine(item);
            }
        }
        finally
        {
            if (sw != null)
            {
                sw.Dispose();
            }
        }
    }

    // Points `path` at the next numbered list file, creating it when missing.
    static void new_file()
    {
        path = @"C:\temp\list" + file + ".txt";
        if (!File.Exists(path))
        {
            using (StreamWriter sw = File.CreateText(path))
            {
            }
        }
        file++;
    }
}
your alphabet is missing 0
With that fixed there would be 89 chars in your set. Let's call it 100 for simplicity. The set you are looking for is all the 8 character length strings drawn from that set. There are 100^8 of these, i.e. 10,000,000,000,000,000.
The disk space they will take up depends on how you encode them. Let's be generous: assume you use some 8-bit char set that contains these characters, and you don't put in carriage returns, so one byte per char, 8 chars per string — so 8 × 10,000,000,000,000,000 bytes = 80,000,000,000,000,000 bytes ≈ 80 petabytes?
Do you have 80 petabytes of disk? (80,000 TB)?
[EDIT] In response to 'this is not an answer':
The original motivation is to create the list? This shows how large the list would be. It's hard to see what could be DONE with the list if it were actually produced, i.e. it would always be quicker to regenerate it than to load it. Surely whatever point could be made by producing the list can also be made by simply knowing its size, and the above shows how to work that out.
There are LOTS of inefficiencies in your code, but if your question is 'how can I quickly produce this list and write it to disk' the answer is 'you literally cannot'.
[/EDIT]
I am reading IMDB movies listing from a text file on my harddrive (originally available from IMDB site at ftp://ftp.fu-berlin.de/pub/misc/movies/database/movies.list.gz).
It takes around 5 minutes on my machine (basic info: Win7 x64bit, 16GB RAM, 500 GB SATA Hardisk 7200 RPM) to read this file line by line using code below.
I have two questions:
Is there any way I can optimize code to improve the read time?
Data access don't need to be sequential as I won't mind reading data from top to bottom / bottom to top or any order for that matter as long as it read one line at a time. I am wondering is there a way to read in multiple directions to improve the read time?
The application is a Windows Console Application.
Update: Many responses correctly pointed out that writing to the Console takes substantial time. With that in mind, displaying the data on the Windows Console is now desirable but not mandatory.
//Code Block
// Question code: reads the movie list line by line and echoes each line to
// the console. Kept as posted apart from restoring the verbatim-string prefix
// (@) that was garbled to '#'.
string file = @"D:\movies.list";
// Opened with an 8-byte buffer and no sharing, exactly as in the question.
FileStream fs = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);
using (StreamReader sr = new StreamReader(fs))
{
    // Peek() returns -1 once there is nothing left to read.
    while (sr.Peek() >= 0)
    {
        Console.WriteLine(sr.ReadLine());
    }
}
I'm not certain whether this is more efficient or not, but an alternate method would be to use File.ReadAllLines:
// Load the whole file into memory, then print it line by line.
string[] movieFile = File.ReadAllLines(file);
for (int k = 0; k < movieFile.Length; k++)
{
    Console.WriteLine(movieFile[k]);
}
I am not a c# developer, but how about doing a bulk insert into database using the file(which will be one time). Then you can reuse the data and export as well.
In .net 4 you can use File.ReadLines for lazy evaluation and thus lower RAM usage when working on large files.
You can do linq operation directly on files and this along with File.ReadLines would improve load time.
For better understanding you can check, Read text file word-by-word using LINQ
You can also do comparison as well but putting time intervals.
However, if you are making a web app, you can read the whole file in the application start event and cache it in the application pool for better performance.
First of all, if you don't care about printing out the list to console, please edit your question.
Second, I created a timing program to test the speeds of the different methods suggested:
class Program
{
// File every test reads (verbatim-string prefix '@' restored from a garbled '#').
private static readonly string file = @"movies.list";
// Index of the first test to run, and the exclusive upper bound on test indices.
private static readonly int testStart = 1;
private static readonly int numOfTests = 2;
// Minimum total milliseconds a timing run must take before it is accepted.
private static readonly int MinTimingVal = 1000;
// Display names, indexed like the action array built in Main.
private static string[] testNames = new string[] {
    "Naive",
    "OneCallToWrite",
    "SomeCallsToWrite",
    "InParallel",
    "InParallelBlcoks",
    "IceManMinds",
    "TestTiming"
};
// Per-test results, indexed by test number.
private static double[] avgSecs = new double[numOfTests];
private static int[] testIterations = new int[numOfTests];
/// <summary>
/// Runs the tests with indices from <c>testStart</c> up to (but excluding)
/// <c>numOfTests</c>, then prints the averaged timings.
/// </summary>
public static void Main(string[] args)
{
    Console.WriteLine("Starting tests...");
    Debug.WriteLine("Starting tests...");
    Console.WriteLine("");
    Debug.WriteLine("");
    //*****************************
    //The console is the bottle-neck, so we can
    //speed-up redrawing it by only showing 1 line at a time.
    Console.WindowHeight = 1;
    Console.WindowWidth = 50;
    Console.BufferHeight = 100;
    Console.BufferWidth = 50;
    //******************************
    // All known tests, in the same order as testNames. The array is sized by
    // its contents; sizing it by numOfTests overflowed as soon as numOfTests
    // was smaller than the number of tests listed here.
    Action[] actionArray =
    {
        naive,
        oneCallToWrite,
        someCallsToWrite,
        inParallel,
        inParallelBlocks,
        iceManMinds,
        testTiming
    };
    // Only run tests the result arrays (sized by numOfTests) have room for.
    int lastTest = Math.Min(numOfTests, actionArray.Length);
    for (int i = testStart; i < lastTest; i++)
    {
        Action a = actionArray[i];
        DoTiming(a, i);
    }
    printResults();
    Console.WriteLine("");
    Debug.WriteLine("");
    Console.WriteLine("Tests complete.");
    Debug.WriteLine("Tests complete.");
    Console.WriteLine("Press Enter to Close Console...");
    Debug.WriteLine("Press Enter to Close Console...");
    Console.ReadLine();
}
/// <summary>
/// Times <paramref name="a"/>: one untimed warm-up call, then batches of
/// calls whose size doubles until a batch takes at least
/// <c>MinTimingVal</c> milliseconds. Stores the per-call average (seconds)
/// and the batch size under index <paramref name="num"/>.
/// </summary>
private static void DoTiming(Action a, int num)
{
    a.Invoke(); // warm-up, not measured

    Stopwatch watch = new Stopwatch();
    int numOfIterations = 2;
    while (true)
    {
        watch.Restart();
        for (int i = 0; i < numOfIterations; i++)
        {
            a.Invoke();
        }
        watch.Stop();

        if (watch.ElapsedMilliseconds >= MinTimingVal)
        {
            break;
        }
        // Too short to be meaningful: double the load and measure again.
        numOfIterations *= 2;
    }

    long totalTime = watch.ElapsedMilliseconds;
    double avgTime = ((double)totalTime) / (double)numOfIterations;
    avgSecs[num] = avgTime / 1000.00;
    testIterations[num] = numOfIterations;
}
// Prints one "ElapsedTime ... test: <name>" line per executed test to both
// the console and the debug output.
private static void printResults()
{
    Console.WriteLine("");
    Debug.WriteLine("");
    int index = testStart;
    while (index < numOfTests)
    {
        string elapsed = TimeSpan.FromSeconds(avgSecs[index]).ToString();
        string format = "ElapsedTime: {0:N4}, " + "test: " + testNames[index];
        Console.WriteLine(format, elapsed);
        // NOTE(review): with two string arguments this presumably binds to the
        // Debug.WriteLine(message, category) overload — confirm if the debug
        // output matters.
        Debug.WriteLine(format, elapsed);
        index++;
    }
}
// Baseline: read the file line by line and write each line to the console.
public static void naive()
{
    var source = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);
    using (var lines = new StreamReader(source))
    {
        while (lines.Peek() != -1)
        {
            string current = lines.ReadLine();
            Console.WriteLine(current);
        }
    }
}
// Collects the whole file in a StringBuilder (each line prefixed with "\n")
// and writes it to the console in a single call.
public static void oneCallToWrite()
{
    var source = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);
    using (var lines = new StreamReader(source))
    {
        var buffer = new StringBuilder();
        while (lines.Peek() != -1)
        {
            buffer.Append("\n").Append(lines.ReadLine());
        }
        Console.Write(buffer.ToString());
    }
}
// Like oneCallToWrite, but flushes the buffer to the console every 10,000
// lines, and once more at the end for the remainder.
public static void someCallsToWrite()
{
    var source = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.None, 8, FileOptions.None);
    using (var lines = new StreamReader(source))
    {
        const int mod = 10000;
        var buffer = new StringBuilder();
        int count = 0;
        while (lines.Peek() != -1)
        {
            count++;
            buffer.Append("\n").Append(lines.ReadLine());
            if (count % mod != 0)
            {
                continue;
            }
            Console.Write(buffer.ToString());
            buffer = new StringBuilder();
        }
        Console.Write(buffer.ToString());
    }
}
// Reads all lines up front, then writes them to the console from a
// Parallel.For loop (one WriteLine per line).
public static void inParallel()
{
    string[] wordsFromFile = File.ReadAllLines(file);
    Parallel.For(0, wordsFromFile.Length, index => Console.WriteLine(wordsFromFile[index]));
}
// Reads all lines up front; each Parallel.For worker appends its lines to a
// local StringBuilder, which is written to the console when the worker finishes.
public static void inParallelBlocks()
{
    string[] wordsFromFile = File.ReadAllLines(file);
    Parallel.For<StringBuilder>(
        0,
        wordsFromFile.Length,
        () => new StringBuilder(),
        (index, loopState, local) =>
        {
            local.Append("\n" + wordsFromFile[index]);
            return local;
        },
        local => Console.Write(local));
}
#region iceManMinds
// Reads the file in rounds: each round starts NumberOfThreads threads that
// each read one consecutive block of ThreadReadBlockSize bytes, waits for all
// of them, and copies their output into one buffer at the matching offsets.
// The assembled bytes are decoded as ASCII and written to the console.
public static void iceManMinds()
{
string FileName = file;
long ThreadReadBlockSize = 50000;
int NumberOfThreads = 4;
byte[] _inputString;
var fi = new FileInfo(FileName);
long totalBytesRead = 0;
long fileLength = fi.Length;
long readPosition = 0L;
Console.WriteLine("Reading Lines From {0}", FileName);
var threads = new Thread[NumberOfThreads];
var instances = new ReadThread[NumberOfThreads];
// Destination buffer for the whole file.
_inputString = new byte[fileLength];
// Keep starting rounds until the bytes copied so far cover the file length.
while (totalBytesRead < fileLength)
{
// Start one reader per thread, each at the next block offset.
for (int i = 0; i < NumberOfThreads; i++)
{
var rt = new ReadThread { StartPosition = readPosition, BlockSize = ThreadReadBlockSize };
instances[i] = rt;
threads[i] = new Thread(rt.Read);
threads[i].Start();
readPosition += ThreadReadBlockSize;
}
// Wait for every reader of this round before touching its output.
for (int i = 0; i < NumberOfThreads; i++)
{
threads[i].Join();
}
// After Read(), BlockSize holds the number of bytes actually read; blocks
// that read nothing (started at or past the end of the file) are skipped.
for (int i = 0; i < NumberOfThreads; i++)
{
if (instances[i].BlockSize > 0)
{
Array.Copy(instances[i].Output, 0L, _inputString, instances[i].StartPosition,
instances[i].BlockSize);
totalBytesRead += instances[i].BlockSize;
}
}
}
string finalString = Encoding.ASCII.GetString(_inputString);
Console.WriteLine(finalString);//.Substring(104250000, 50000));
}
/// <summary>
/// Work item for one reader thread: reads up to <see cref="BlockSize"/> bytes
/// of the file starting at <see cref="StartPosition"/>.
/// </summary>
private class ReadThread
{
    /// <summary>Byte offset in the file where this block starts.</summary>
    public long StartPosition { get; set; }

    /// <summary>Requested block size before <see cref="Read"/>; bytes actually read afterwards.</summary>
    public long BlockSize { get; set; }

    /// <summary>Buffer holding the bytes read; valid up to <see cref="BlockSize"/>.</summary>
    public byte[] Output { get; private set; }

    public void Read()
    {
        Output = new byte[BlockSize];
        int total = 0;
        // `using` closes the stream even when Seek or Read throws.
        using (var inStream = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            inStream.Seek(StartPosition, SeekOrigin.Begin);
            // Stream.Read may return fewer bytes than requested, so keep
            // reading until the block is full or the end of the file is hit.
            int read;
            while (total < Output.Length &&
                   (read = inStream.Read(Output, total, Output.Length - total)) > 0)
            {
                total += read;
            }
        }
        BlockSize = total;
    }
}
#endregion
// Dummy test with a known duration (half a second), used to check the
// timing harness itself.
public static void testTiming()
{
    const int pauseMilliseconds = 500;
    Thread.Sleep(pauseMilliseconds);
}
}
Each of these tests print the file out to console.
When run under default Console settings, each test took between 5:30 and 6:10 (Min:Sec).
After considering the Console properties, by making Console.WindowHeight = 1, that is, only 1 line is shown at a time, (you can scroll up and down to see the most recent 100 lines), and I achieved a speed-up.
Currently, the task completes in just a little over 2:40 (Min:Sec) for most methods.
Try it out on your computer and see how it works for you.
Interestingly enough, the different methods were basically equivalent, with the OP's code being basically the fastest.
The timing code warms-up the code then runs it twice and averages the time it takes, it does this for each method.
Feel free to try out your own methods and time them.
The answer to this question really depends on what it is you will be doing with the data. If your intention truly is to just read in the file and dump the contents to the console screen, then it would be better to use the StringBuilder Class to build up a string of, say 1000 lines, then dump the contents to the screen, reset the string then read in another 1000 lines, dump them, etc etc...
However if you are trying to build something that is part of a larger project and you are using .NET 4.0, you can use the MemoryMappedFile Class to read the file and create a CreateViewAccessor to create a "window" that operates on just a portion of the data instead of reading in the entire file.
Another option would be to make Threads that read different parts of the file all at once, then puts it all together in the end.
If you can be more specific as to what you plan to do with this data, I can help you more. Hope this helps!
EDIT:
Try this code out man. I was able to read the whole list in literally 3 seconds time using Threads:
using System;
using System.IO;
using System.Text;
using System.Threading;
namespace ConsoleApplication36
{
    /// <summary>
    /// Reads a file with several threads, each fetching one fixed-size block
    /// per round, reassembles the blocks in order and prints a sample of the text.
    /// </summary>
    class Program
    {
        private const string FileName = @"C:\Users\Public\movies.list";
        private const long ThreadReadBlockSize = 50000;
        private const int NumberOfThreads = 4;
        // Offset and length of the sample printed at the end.
        private const int SampleStart = 104250000;
        private const int SampleLength = 50000;
        private static byte[] _inputString;

        static void Main(string[] args)
        {
            var fi = new FileInfo(FileName);
            long totalBytesRead = 0;
            long fileLength = fi.Length;
            long readPosition = 0L;
            Console.WriteLine("Reading Lines From {0}", FileName);
            var threads = new Thread[NumberOfThreads];
            var instances = new ReadThread[NumberOfThreads];
            _inputString = new byte[fileLength];
            // One round per iteration: start the readers, wait, then merge.
            while (totalBytesRead < fileLength)
            {
                for (int i = 0; i < NumberOfThreads; i++)
                {
                    var rt = new ReadThread { StartPosition = readPosition, BlockSize = ThreadReadBlockSize };
                    instances[i] = rt;
                    threads[i] = new Thread(rt.Read);
                    threads[i].Start();
                    readPosition += ThreadReadBlockSize;
                }
                for (int i = 0; i < NumberOfThreads; i++)
                {
                    threads[i].Join();
                }
                for (int i = 0; i < NumberOfThreads; i++)
                {
                    // BlockSize now holds the bytes actually read; 0 means the
                    // block started at or past the end of the file.
                    if (instances[i].BlockSize > 0)
                    {
                        Array.Copy(instances[i].Output, 0L, _inputString, instances[i].StartPosition,
                            instances[i].BlockSize);
                        totalBytesRead += instances[i].BlockSize;
                    }
                }
            }
            string finalString = Encoding.ASCII.GetString(_inputString);
            // Print the sample window, clamped so shorter files do not throw.
            if (finalString.Length > SampleStart)
            {
                int available = Math.Min(SampleLength, finalString.Length - SampleStart);
                Console.WriteLine(finalString.Substring(SampleStart, available));
            }
        }

        /// <summary>Work item for one reader thread.</summary>
        private class ReadThread
        {
            /// <summary>Byte offset in the file where this block starts.</summary>
            public long StartPosition { get; set; }

            /// <summary>Requested block size before <see cref="Read"/>; bytes actually read afterwards.</summary>
            public long BlockSize { get; set; }

            /// <summary>Buffer holding the bytes read; valid up to <see cref="BlockSize"/>.</summary>
            public byte[] Output { get; private set; }

            public void Read()
            {
                Output = new byte[BlockSize];
                int total = 0;
                // `using` closes the stream even when Seek or Read throws.
                using (var inStream = new FileStream(FileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                {
                    inStream.Seek(StartPosition, SeekOrigin.Begin);
                    // Stream.Read may return fewer bytes than requested, so
                    // keep reading until the block is full or the file ends.
                    int read;
                    while (total < Output.Length &&
                           (read = inStream.Read(Output, total, Output.Length - total)) > 0)
                    {
                        total += read;
                    }
                }
                BlockSize = total;
            }
        }
    }
}
You will need to change the FileName to match the location of your movies.list file. Also, you can adjust the total number of threads. I used 4, but you can decrease or increase this at will. You can also change the Block Size...This is how much data each thread reads in. Also, I'm assuming its an ASCII text file. If its not, you need to change the encoding type to UTF8 or whatever encoding the file is in. Good luck!
I have a text box field whose input looks like 123,145,125. I want to separate this field into an array of integers, and have the validation return true or false depending on whether everything parsed correctly.
CODE:
// Question code (first attempt): splits iconeID.Text on commas and tries to
// return the parsed ints through `val`.
private bool chkID(out int[] val)
{
char[] delimiters = new char[] { ',' };
string[] strSplit = iconeID.Text.Split(delimiters);
int[] intArr = null;
foreach (string s in strSplit) //splits the new parsed characters
{
int tmp;
tmp = 0;
// Grow intArr by one slot for every piece that parses as an int.
if (Int32.TryParse(s, out tmp))
{
if (intArr == null)
{
intArr = new int[1];
}
else
{
Array.Resize(ref intArr, intArr.Length + 1);
}
intArr[intArr.Length - 1] = tmp;
}
// This check parses the WHOLE text box content, not the current piece, and
// returns from inside the loop with a one-element array.
// NOTE(review): as written, intArr is never assigned to val, so input with
// more than one number ends in the failure path below — confirm intent.
if (Int32.TryParse(iconeID.Text, out tmp))
{
iconeID.BorderColor = Color.Empty;
iconeID.BorderWidth = Unit.Empty;
tmp = int.Parse(iconeID.Text);
val = new int[1];
val[0] = tmp;
return true;
}
}
// Failure path: no result, and a control named ID (not iconeID) is marked in red.
val = null;
ID.BorderColor = Color.Red;
ID.BorderWidth = 2;
return false;
}
//new Code:
/// <summary>
/// Parses the comma-separated ints in <c>strID.Text</c> and marks the control
/// red when any piece fails to parse.
/// </summary>
/// <param name="val">The parsed numbers on success; null on failure.</param>
/// <returns>True when every piece parsed as an int.</returns>
private bool chkID(out int[] val) //bool status for checkID function
{
    string[] split = strID.Text.Split(new char[1] { ',' });
    // The generic argument was lost in the posted code; the list holds ints.
    List<int> numbers = new List<int>();
    bool isOk = true;
    foreach (string n in split)
    {
        int parsed;
        if (Int32.TryParse(n, out parsed))
        {
            numbers.Add(parsed);
        }
        else
        {
            isOk = false;
        }
    }

    if (isOk)
    {
        strID.BorderColor = Color.Empty;
        strID.BorderWidth = Unit.Empty;
        // The result travels through the out parameter; the method itself returns bool.
        val = numbers.ToArray();
        return true;
    }

    strID.BorderColor = Color.Red;
    strID.BorderWidth = 2;
    val = null;
    return false;
}
The given function seems to do too much. Here's one that answers the question implied by your title:
//int[] x = SplitStringIntoInts("1,2,3, 4, 5");
// Splits a comma-separated string and returns the pieces that parse as ints,
// in their original order; pieces that do not parse are dropped.
static int[] SplitStringIntoInts(string list)
{
    var collected = new List<int>();
    string[] pieces = list.Split(',');
    for (int k = 0; k < pieces.Length; k++)
    {
        int candidate;
        if (!int.TryParse(pieces[k], out candidate))
        {
            continue;
        }
        collected.Add(candidate);
    }
    return collected.ToArray();
}
EDIT (based on your comment on the question)
You've defined the three things this function needs to do. Now you just need to create methods for each. Below are my guesses for how you could implement them.
// Placeholder validation step: currently returns a copy of the input.
int[] ValidateIDs(int[] allIDs)
{
    var kept = new List<int>(allIDs);
    //remove invalid IDs
    int[] result = kept.ToArray();
    return result;
}
// Stub for the download step; the body is elided in the original answer
// (the "..." below is a placeholder, not code).
void DownloadXmlData(int[] ids)
{
...
}
Now you just execute your new functions:
// Pipeline: split the text into ints, validate them, then download for the
// IDs that remain.
void CheckIconeID(string ids)
{
    DownloadXmlData(ValidateIDs(SplitStringIntoInts(ids)));
}
I really wanted to comment on @Austin Salonen's answer, but it didn't fit. It is a great answer for the question asked, but I wanted to expand the discussion a bit more generally on the csv/int conversion part.
It's small point, not worth much debate but I would consider swapping the foreach loop for a plain for loop. You'll likely end up with simpler IL (read faster). See (http://www.codeproject.com/KB/cs/foreach.aspx, http://msdn.microsoft.com/en-us/library/ms973839.aspx [Use For Loops for String Iteration—version 1]).
I would create two methods -- one that is safe and uses TryParse and only adds the "good" values, another that is not as safe, but faster.
Proposed "safe" function (with overload in case you don't want to know the bad values)...
// Convenience overload: comma-separated input, rejected pieces discarded.
public static int[] SplitAsIntSafe (this string csvString) {
    List<string> ignored;
    int[] parsed = SplitAsIntSafe(csvString, ',', out ignored);
    return parsed;
}
/// <summary>
/// Splits <paramref name="delimitedString"/> on <paramref name="splitChar"/>
/// and returns the pieces that parse as ints, in order.
/// </summary>
/// <param name="delimitedString">Text to split; must not be null.</param>
/// <param name="splitChar">Delimiter between values.</param>
/// <param name="badVals">Receives the pieces that did not parse, in order.</param>
/// <exception cref="ArgumentNullException"><paramref name="delimitedString"/> is null.</exception>
public static int[] SplitAsIntSafe (this string delimitedString, char splitChar, out List<string> badVals) {
    if (delimitedString == null) {
        throw new ArgumentNullException("delimitedString");
    }
    // Split on the caller's delimiter; it used to be ignored in favour of ','.
    string[] split = delimitedString.Split(splitChar);
    List<int> numbers = new List<int>();
    badVals = new List<string>();
    for (var i = 0; i < split.Length; i++) {
        int parsed;
        if (int.TryParse(split[i], out parsed)) {
            numbers.Add(parsed);
        } else {
            badVals.Add(split[i]);
        }
    }
    return numbers.ToArray();
}
Proposed "fast" function ....
/// <summary>
/// Splits <paramref name="delimitedString"/> on <paramref name="splitChar"/>
/// and parses every piece with <c>int.Parse</c>, with no tolerance for bad input.
/// </summary>
/// <returns>The parsed values; an empty array when the input is null.</returns>
/// <exception cref="FormatException">A piece is not a valid int.</exception>
public static int[] SplitAsIntFast (this string delimitedString, char splitChar) {
    // The null check has to come before the first use of the string.
    if (delimitedString == null) {
        return new int[0];
    }
    string[] strArray = delimitedString.Split(splitChar);
    int[] intArray = new int[strArray.Length];
    for (var i = 0; i < strArray.Length; i++) {
        intArray[i] = int.Parse(strArray[i]);
    }
    return intArray;
}
Anyway, hope this helps someone.
It might be worth your while to check out this FileHelper and also CSV Reader
Hope they will help you...
Take care,
Tom
There is a good free library for parsing CSV files: FileHelpers
using FileHelpers;
// First declare the record class
// Record layout for FileHelpers: one ';'-delimited line per record, fields in
// declaration order. The attribute is spelled DelimitedRecord (C# attribute
// names are case-sensitive).
[DelimitedRecord(";")]
public class SampleType
{
    public string Field1;
    public int Field2;
}
// Reads "source.txt" into SampleType records and adds up Field2 of the first two.
public void ReadExample()
{
    var engine = new FileHelperEngine(typeof(SampleType));
    // ReadFile returns the records of the source file as an array.
    SampleType[] records = (SampleType[]) engine.ReadFile("source.txt");
    SampleType first = records[0];
    SampleType second = records[1];
    int sum = first.Field2 + second.Field2;
}
// Splits `source` on commas, sorting the pieces into parsed ints and rejected
// strings. Returns true only when nothing was rejected.
public bool ParseAndCheck(string source,
    out IList<int> goodItems, out IList<string> badItems)
{
    var parsed = new List<int>();
    var rejected = new List<string>();
    string[] tokens = source.Split(',');
    for (int k = 0; k < tokens.Length; k++)
    {
        int number;
        if (int.TryParse(tokens[k], out number))
        {
            parsed.Add(number);
        }
        else
        {
            rejected.Add(tokens[k]);
        }
    }
    goodItems = parsed;
    badItems = rejected;
    return rejected.Count == 0;
}
In .NET 2.0 you could write
// Converts every comma-separated piece to an int in one Array.ConvertAll
// call; pieces that fail to parse become 0 and clear isParsingOk.
string test = "123,14.5,125,151,1.55,477,777,888";
bool isParsingOk = true;
Converter<string, int> toInt = delegate(string num)
{
    int r;
    bool parsed = int.TryParse(num, out r);
    isParsingOk = isParsingOk & parsed;
    return r;
};
string[] pieces = test.Split(',');
int[] results = Array.ConvertAll<string, int>(pieces, toInt);
This is simple and I think works pretty well. It only return valid numbers:
/// <summary>
/// Splits a comma-separated string and returns only the pieces that parse as
/// ints, in their original order.
/// </summary>
static int[] SplitStringIntoInts(string list)
{
    // Parse each piece once and carry the outcome along, instead of calling
    // TryParse to filter and then Parse again on the same text.
    return list.Split(',')
        .Select(piece =>
        {
            int value;
            bool ok = int.TryParse(piece, out value);
            return new { ok, value };
        })
        .Where(result => result.ok)
        .Select(result => result.value)
        .ToArray();
}