Can you improve the speed of my CSV Reader and Writer?

holly7787 · ‎12-22-2014

hi all, i'm trying to develop a CSV Writer and Reader. i have done a good work to implement the special character and quoting it, it's also support multi line value but it's incredibly slow.

can someone help to make it faster?

here it's how to use the writer

char *stringhe_sorgenti[10] = {0};

out = OpenFile(nfile, VAL_WRITE_ONLY, VAL_TRUNCATE, VAL_ASCII);

 for(i = 0; i < sizeof(stringhe_sorgenti)/sizeof(char*); i++){
 stringhe_sorgenti[i] = (char*)calloc(200, sizeof(char));
 }

sprintf(stringhe_sorgenti[0], "example1");
sprintf(stringhe_sorgenti[1], "example2");
scrivi_riga_csv(out, stringhe_sorgenti, sizeof(stringhe_sorgenti)/sizeof(char*), formato);

 for(i = 0; i < sizeof(stringhe_sorgenti)/sizeof(char*); i++){
 free(stringhe_sorgenti[i]);
 }

CloseFile(out);

here is the writer

void scrivi_riga_csv(int file_handle, char *stringa_sorgente[], int numero_stringhe, int formato)
{
	char delimitatore[2][2] = {{',', '\0'}, {';', '\0'}};
	char stringa_destinazione[1024] = {0};
	int index_destinazione = {0};
	int index_start = {0};
	int index_fine = {0};
	int errore = {0};
	int i = {0};
	//int k = {0};
	size_t lunghezza_stringa = {0};
	
	
	for(i = 0; i < numero_stringhe; i++){
		if(i != 0){
			stringa_destinazione[index_destinazione++] = delimitatore[formato][0];
		}
		index_start = 0;
		
		lunghezza_stringa = strlen(stringa_sorgente[i]);
			
		// se la stringa sorgente
		if( (FindPattern(stringa_sorgente[i], 0, lunghezza_stringa, delimitatore[formato], 0, 0) != -1) // contiene delimitatore
			|| (FindPattern(stringa_sorgente[i], 0, lunghezza_stringa, "\"", 0, 0) != -1) // contiene parentesi
			|| (FindPattern(stringa_sorgente[i], 0, lunghezza_stringa, "\n", 0, 0) != -1) // contiene a capo
			){
			// apro parentesi all'inizio
			stringa_destinazione[index_destinazione++] = '"';
						
			// metodo find pattern, piu' complesso ma piu' performante
			do{ index_fine = FindPattern(stringa_sorgente[i], index_start, lunghezza_stringa - index_start, "\"", 0, 0);
				if(index_fine != -1){
					index_fine++;
					// copio dall'inizio fino alle virgolette
					CopyString (stringa_destinazione, index_destinazione, stringa_sorgente[i], index_start, index_fine - index_start);
					index_destinazione += index_fine - index_start;
					// ne aggiungo una dopo
					stringa_destinazione[index_destinazione++] = '"';
					// aggiorno la posizione di start e riparto con il while
					index_start = index_fine;
				}
			}while(index_fine != -1);
			CopyString (stringa_destinazione, index_destinazione, stringa_sorgente[i], index_start, lunghezza_stringa - index_start);
			index_destinazione += strlen(stringa_sorgente[i]) - index_start;
			
			// alla fine della riga chiudo la parentesi
			stringa_destinazione[index_destinazione++] = '"';
		}
		else{
			// altrimenti la copio semplicemente e shifto l'indice della stringa di destinazione
			CopyString (stringa_destinazione, index_destinazione, stringa_sorgente[i], 0, lunghezza_stringa);
			index_destinazione += strlen(stringa_sorgente[i]);
		}
		memset(stringa_sorgente[i], 0, strlen(stringa_sorgente[i]));
	}
	
	errore = WriteLine (file_handle, stringa_destinazione, strlen(stringa_destinazione));
	if(errore == -1){
		errore = GetFmtIOError();
		MessagePopup("WriteLine -> WriteLine", GetFmtIOErrorString(errore));
	}
	return;
}

here how to read the file

char *stringhe_sorgenti[10] = {0};

 for(i = 0; i < sizeof(stringhe_sorgenti)/sizeof(char*); i++){
 stringhe_sorgenti[i] = (char*)calloc(200, sizeof(char));
 }

out = OpenFile(nomearchivio, VAL_READ_ONLY, VAL_OPEN_AS_IS, VAL_BINARY);

leggi_riga_csv(out, stringhe_sorgenti, sizeof(stringhe_sorgenti)/sizeof(char*), formato);
strcpy(intestazione.data, stringhe_sorgenti[1]);

 for(i = 0; i < sizeof(stringhe_sorgenti)/sizeof(char*); i++){
 free(stringhe_sorgenti[i]);
 }

CloseFile(out);

and here the reader

void leggi_riga_csv(int file_handle, char *stringa_destinazione[], int numero_stringhe, int formato)
{
	char delimitatore[2][2] = {{',', '\0'},
							   {';', '\0'}};
	char stringa_sorgente[1024] = {0};

	int stringa_in_corso = {0};
	int index_inizio_valore = {0};
	int index_doublequote = {0};
	int offset_stringa_destinazione = {0};
	
	size_t lunghezza_stringa = {0};
	int inquote = {0};
	int errore = {0};
	int i = {0};
	
	for(i = 0; i < numero_stringhe; i++){
		lunghezza_stringa = strlen(stringa_destinazione[i]);
		memset(stringa_destinazione[i], 0, lunghezza_stringa);
	}
	
	do{ memset(&stringa_sorgente, 0, sizeof(stringa_sorgente));
		errore = ReadLine(file_handle, stringa_sorgente, sizeof(stringa_sorgente) - 1);
		// If ReadLine reads no bytes because it has already reached the end of the file, it returns –2.
		// If an I/O error occurs, possibly because of a bad file handle, ReadLine returns –1.
		// You can use GetFmtIOError to get more information about the type of error that occurred.
		// A value of 0 indicates that ReadLine read an empty line.
		if(errore == -1){
			errore = GetFmtIOError();
			MessagePopup("leggi_riga_csv -> ReadLine", GetFmtIOErrorString(errore));
			return;
		}
		else if(errore == -2){
			errore = GetFmtIOError();
			MessagePopup("leggi_riga_csv -> ReadLine", "already reached the end of the file");
			return;
		}
		else{
		
			lunghezza_stringa = errore;
			index_inizio_valore = 0;
		
			// metodo find pattern, piu' complesso ma piu' performante
			for(i = 0; i <= lunghezza_stringa; i++){
			
				// se come primo carattere ho una " allora e' una stringa speciale
				if(inquote == 0){
					if(stringa_sorgente[i] == '\"'){
						inquote = 1;
						index_inizio_valore = ++i;
					}
					else{
						// altrimenti cerco il delimitatore senza il ciclo for
						i = FindPattern(stringa_sorgente, i, lunghezza_stringa - index_inizio_valore, delimitatore[formato], 0, 0);
						if(i == -1){
							// se non lo trovo ho finito la riga
							i = lunghezza_stringa;
							if(stringa_sorgente[i - 1] == '\r'){
								i--;
							}
						}
						if(stringa_in_corso < numero_stringhe){
							CopyString (stringa_destinazione[stringa_in_corso], 0, stringa_sorgente, index_inizio_valore, i - index_inizio_valore);
						}
						offset_stringa_destinazione = 0;
						stringa_in_corso++;
						if(stringa_sorgente[i] == '\r'){
							i++;
						}
						index_inizio_valore = i + 1;
					}
				}
				
				if(inquote == 1){
					// se sono nelle parentesi cerco le virgolette
					i = 1 + FindPattern(stringa_sorgente, i, lunghezza_stringa - index_inizio_valore, "\"", 0, 0);
					if(i == 0){
						if(stringa_sorgente[lunghezza_stringa - 1] == '\r'){
							lunghezza_stringa--;
						}
						// se non le trovo ho finito la riga, esco dal ciclo for
						break;
					}
					// se incontro una doppia parentesi salto avanti
					else if(stringa_sorgente[i] == '\"'){
						continue;
					}
					// !!!! fondamentale non cambiare l'ordine di questi else if !!!!!
					// se incontro una parentesi seguita dal delimitatore
					// o se incontro una parentesi seguita dal terminatore
					// \r = CR = 0x0D = 13
					// \n = LF = 0x0A = 10
					// a capo = CR + LF
					else if( (stringa_sorgente[i] == delimitatore[formato][0])
							 || (stringa_sorgente[i] == '\r')
							 || (stringa_sorgente[i] == '\0')
							 ){
						// salvo il valore
						inquote = 0;
						if(stringa_in_corso < numero_stringhe){
							CopyString (stringa_destinazione[stringa_in_corso], offset_stringa_destinazione, stringa_sorgente, index_inizio_valore, i - 1 - index_inizio_valore);
						}
						offset_stringa_destinazione = 0;
						stringa_in_corso++;
						if(stringa_sorgente[i] == '\r'){
							i++;
						}
						index_inizio_valore = i;
					}
				}
			}
		
			// se sono andato a capo scrivo fino a dove sono e poi procedo con la nuova riga
			if(inquote){
				if(stringa_in_corso < numero_stringhe){
					CopyString (stringa_destinazione[stringa_in_corso], offset_stringa_destinazione, stringa_sorgente, index_inizio_valore, lunghezza_stringa - index_inizio_valore);
					strcat(stringa_destinazione[stringa_in_corso], "\n");
				}
				offset_stringa_destinazione += lunghezza_stringa - index_inizio_valore;
				offset_stringa_destinazione++;
			}
		}
	}while(inquote == 1);
	
	// elimino le doppie parentesi
	for(i = 0; i < numero_stringhe; i++){
		index_doublequote = 0;
		do{ lunghezza_stringa = strlen(stringa_destinazione[i]);
			index_doublequote = FindPattern(stringa_destinazione[i], index_doublequote, lunghezza_stringa - index_doublequote, "\"\"", 0, 0); // contiene doppia parentesi
			if(index_doublequote != -1){
				index_doublequote++;
				memmove (stringa_destinazione[i] + index_doublequote, stringa_destinazione[i] + index_doublequote + 1, lunghezza_stringa - index_doublequote);
			}
		}while(index_doublequote != -1);
	}
	
	return;
}

Davide Vittorio G. - TLGB S.R.L.
Italian SW Developer

RobertoBozzolo · ‎12-22-2014

Hello, the code is long and difficult to understant without knowing the format to write / read: can you add a small sample file to help us understand what's happening?

Also, is the problem in writing or reading?

How many lines do you expect to treat each time and how much time does the code spend to read / write them?

Proud to use LW/CVI from 3.1 on.

My contributions to the Developer Community
________________________________________
If I have helped you, why not giving me a kudos?

holly7787 · ‎12-22-2014

the format is CSV, i try to explain better what i'm doing.

our client asked to save acquisition data with header description in an excel readable format, i've decided to use .CSV and not .TDM because it's a simple txt file and we never used .TMD but i will propose to use it.

after some research on the internet i've found nothing to handle .CSV in CVI except from this csv_parse but i've found it difficult to be maintained so i've write it by my own hand.

i've written two example of how to use my function to read or write and i've copyed my function used to read and write.

in the write function i check with FindPattern if the string to be write contain some special character, if i find this i have to quote the string to respect the standard RFC4180 and if i find a quote i have to double it. aftere i've done this check i write the line in the file.

in the read function, that is more complicated, i:

check if the first character is a quote.
if it's not i copy the string until the delimitier or until the end of the line.
if it is i have a string with special character inside so:

i find the first quote in the string. when i've found i check if it's follwed by another quote. this means that in the starting message i was writing a single quote.
if it's not followed by another quote but it's followed by a delimiter or a carriage return i've finished the special line.
if i don't find it it means that the special quote have a carriage return inside and i have to check the next line. before checking the next line i save this in my string.

after this loop i check in every string if i have a double quote and i delete one.

the main problem is in the speed of this, i'm acquiring data at 1000 S/s with 8 active channel for 60 second so i have 480000 data to be stored, divided in 60.000 row and 8 column. to read a file like that my pc stay "locked" for 15 second or more.

i've tried to use the arraytofile function and it's extremly fast and i can also put header because the function can start from the last position in the file but the filetoarray function start from the beginning and i cannot read the header correctly. also if i'm using the european CSV with semicolon as delimiter with arraytofile i cannot select the semicolon but only the coma

Davide Vittorio G. - TLGB S.R.L.
Italian SW Developer

stef_fr · ‎01-05-2015

I got a similar issue when developing an XML parser/reader. When working with big text files you will gain a lot of time by reducing the number Read/Write operations to the disk.

Instead of using ReadLine/WriteLine functions 60000 times for each file use fread/fwrite with an allocated buffer able to contain many lines (or all the file if your computer memory allows it) and then update you CSV functions to perform conversion from buffer.

It should dramatically improve your application.

Regards

Stéphane

Labwindows/CVI user since version 4.0

RobertoBozzolo · ‎01-07-2015

Reducing disk access can surely improve the speed of your code.

Another hint is to use the technique explained here by NickB: basically he has greatly improved the efficiency of a file reader by reading the text file as a binary one, reading it in a single pass and performing the line split entirely in memory: not a trivial task but very promising according to the figures posted by Nick. Reading the full thread can be an interesting study case.

Proud to use LW/CVI from 3.1 on.

My contributions to the Developer Community
________________________________________
If I have helped you, why not giving me a kudos?

holly7787 · ‎01-07-2015

i have already tried to read all the file in a buffer and then process it but it's not so fast as i expected.

if you want i can post my code.

when i have some time i will try the NickB technique suggested by RobertoBozzolo, i already have a linecounter that is really fast, i only have to add a buffer for each line.

thanks, i will update this thread when the work is done

/// HIFN Voglio scoprire quante righe ho nel file.
/// HIFN Puo' essere usato per fare la progress bar.
/// HIRET restituisco il nuemro di righe nel file.
/// HIPAR nomearchivio/passo il percorso completo del file compreso di nome ed estensione.
int contarighe(char *nomearchivio)
{
	int out = {0};				// handle del file
	int contatore_righe = {0};	// risultato della funzione
	int shift = {0};			// registro di shift per la ricerca di CR nell'array
	ssize_t file_size = {0};	// variabile in cui inserire la dimensione del file
	char *array = {0};			// array di appoggio che verra' usato per la lettura del file

	if (GetFileInfo(nomearchivio, &file_size) != 1) {
		MessagePopup("contarighe -> GetFileInfo", GetFmtIOErrorString(GetFmtIOError()));
		return GetFmtIOError();
	}
	else {
		if (file_size >= INT_MAX) { // limite lettura massimo 2GB
			MessagePopup("contarighe -> GetFileInfo", GetFmtIOErrorString(FmtIOInsuffMemErr));
			return FmtIOInsuffMemErr;
		}
		else {
			out = OpenFile(nomearchivio, VAL_READ_ONLY, 0, VAL_BINARY);
			if (out == -1) {
				MessagePopup("contarighe -> OpenFile", GetFmtIOErrorString(GetFmtIOError()));
				return GetFmtIOError();
			}
			else {
				array = calloc(file_size + 1, sizeof(char));
				if (array == NULL) { // aggiungo un carattere per il terminatore
					MessagePopup("contarighe -> calloc", "Not enough memory");
				}
				else {
					ReadFile(out, array, file_size);
					CloseFile(out);
					while ((shift = FindPattern(array, shift, (file_size + 1) - shift, "\n", 0, 0) + 1) != 0) {
						contatore_righe++;
					}
					free(array);
				}
			}
		}
	}
	return contatore_righe;
}

Davide Vittorio G. - TLGB S.R.L.
Italian SW Developer

holly7787 · ‎01-12-2015

i have tried to use your suggestion but i'm not getting better result than mine.

attached a sample project with the last reader version with 2 example file.

if you want to get a try i've done it with CVI 2012 SP1 but i think it will work on every version

Davide Vittorio G. - TLGB S.R.L.
Italian SW Developer

stef_fr · ‎01-12-2015

Hi,

I have modified your example (project in attachment). I have added two indicators to display the time spend (in seconds) in each version of the read algorithm. T1 is yours and T2 is a version that parse a CSV line with a state machine that doesn't use FindPattern, strlen and CopyString to avoid too much useles iterations.

On my computer using the biggest file I obtain the following results :

in debug mode ==> T1=35.1s and T2=6.2s

in release mode ==> T1=0.494s and T2=0.123s

Result in release mode is already good with your actual algorithm, you were speaking about 15s to perform a read operation, was it in debug mode ?

Regards

Stef

Labwindows/CVI user since version 4.0

holly7787 · ‎01-12-2015

yes i was in debug with profiling active.

Thank for your version, i'm going to try it right now!

Davide Vittorio G. - TLGB S.R.L.
Italian SW Developer

stef_fr · ‎01-12-2015

It is better to avoid using debug mode when you know already the function you have to optimize. Debug mode is very slow specially when your code performs a lot of dynamic memory allocation because it tracks them.

Labwindows/CVI user since version 4.0

LabWindows/CVI

Can you improve the speed of my CSV Reader and Writer?

Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?