I want to read a file which has some characters in it and check the percentage of printable characters as well as the percentage of white spaces. This is my Python code which generates the input file:
import string
import random
array = list()
array = list(string.printable)
print(array)
external = ['\0','\a','\b','\v','\f','\e']
array = array + external
file = open("in.txt" , 'w')
for i in range (1000):
outputline = array[random.randrange(0,len(array)-1)]
file.write(outputline)
file.close()
I want my file to have both printable characters and whitespaces and other characters (which are not in these two groups). I do this in two ways:
Read the file, chunk by chunk, with the
read
system call in C:#include <stdio.h> #include <stdio.h> #include <ctype.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> int main(int arg, char *argv[]) { char c ; char *data; int numOfWs = 0 ; int numOfPr = 0 ; int numberOfCharacters; int sizeOfBlock ; int nread ; int i=0; int k; float wsP = 0; float prP = 0; sizeOfBlock = atoi(argv[3]) ; data = malloc(sizeOfBlock*sizeof(char)); int fd = open(argv[2], O_RDONLY); while((nread = read(fd, data, sizeOfBlock)) > 0) { numberOfCharacters += nread ; for (i = 0; i < nread; ++i) { c = data[i] ; if(isprint(c)) numOfPr ++ ; else if(isspace (c)) numOfWs ++ ; } } wsP = (numOfWs / (float)numberOfCharacters)*100; prP = (numOfPr / (float)numberOfCharacters)*100 ; printf("%d printable characters out of %d bytes, %.2f%c\n", numOfPr,numberOfCharacters,prP,'%'); printf("%d whitespace characters out of %d bytes, %.2f%c\n", numOfWs,numberOfCharacters,wsP, '%'); exit(0); }
Copy the whole file into memory using
mmap()
and then start to read it from memory:#include <stdio.h> #include <stdlib.h> #include <fcntl.h> #include <unistd.h> #include <sys/types.h> #include <sys/mman.h> #include <sys/stat.h> #include <errno.h> #include <ctype.h> #include <string.h> int main(int arg, char *argv[]) { char c ; int i, numOfWs = 0, numOfPr = 0, numberOfCharacters = 0; char *data; float wsP = 0; float prP = 0; struct stat s; int fp = open("x.txt", O_RDWR); int status = fstat (fp, &s); int size = s.st_size; data = mmap((caddr_t)0, size, PROT_READ, MAP_SHARED, fp,0); for (i=0; i<size; i++){ char c; c = data[i]; if(isspace(c)) numOfWs ++; else if(isprint(c)) numOfPr ++; numberOfCharacters ++ ; } wsP = (numOfWs / (float)numberOfCharacters)*100; prP = (numOfPr / (float)numberOfCharacters)*100 ; printf("%d printable characters out of %d bytes, %.2f%c\n", numOfPr,numberOfCharacters,prP,'%'); printf("%d whitespace characters out of %d bytes, %.2f%c\n", numOfWs,numberOfCharacters,wsP, '%'); close(fp); exit(0); }
when I check these two ways with the same file (created by the Python code) I get two different answers for the percentages but the number of characters which are read each way are the same (1000).
This is my file generated from the Python code (I don't know what happens when I copied it here it contains lots of control characters):
el*2mlz_XyjP@%?Sw}Qo~.."tJ^~6,eN8+kq l)*N-1oupE
)coFKoA0\=|W'{Oezx~^p(B5ZJe!AdYb{Gflv&wwCf8}>3"v*>9\pW8PIs;qpX7RSk<9}&8B$u kNaq(mJK$N-!38?E%8-T,I1zC~0O=}FH*
x9x6Q%GT_C0j>7:@EG{N ?Eh$?18;Ncy[3 $'ikKs%:A].?e;i4`x"k!VD]}*pw
?\wE~Vix7^H~[26lsN?_GO$vz3M464S`+h=A(5@]q<&<+ hjmehAb-_3*3F8&#iM3p)6T`S9Q\yZwm$U`OHG}02{A)WcVzBR1h}H?qhF:P^-j5AQ1<7FD60j#B#}9Z=}2QReaYy|{Wv<^!yOC/7P}n*ZEPV2@8cU),=*5]]d a:3J;Y(?D?31$pcrquc#&PB;A[9lV+gJ%WZ6K~A|%^E_\3dM/?"y)BtUtG"3hf}W4,3DrXxTyl\UbWwCbMufqCNWx |hiJ\>43S6tCCS)rEo0.cz5PjgK0_AKN|8'g]byLp9AlrZDuK1OX,Csa}nu&i_p,#
Wyc{Q
LA\4:!WSq"ln|Pv.B;+N'h%O;tu(CgIh~OYIXCl+6~nSxBuybP nH:j;t'\vk&p}
,;3Ny#`Ug!rVbqExY| %BVCD^D~z:L(j8L! @ X4a!KBNCQ4z&3^9[O<fkM-qrOq5F/M*]yyU+-VLdZRtUu
a"=a b%c~GI|tcC/
P'/`t|hZ/2iHd94l"%;4-{)VUw%%3"e%IQ{RAX]NeMcsh&@LziT0)_T"2XADH&NYqa<6,$wdSp@LIMGA&,Gx1mj|t't?7=YtT77r<qi8;|tzi kOAi'dq%+g2 5hY?XTj{F)18.Vd!!$Q{D)}$7XxO)Vi%29*,P"cXkC,M|&brd&-DGF>V4 %N)a"VM+TQ$FI;YiL-0 YSxXgC@i~,o6/a7U2c"eGr\N7^B:'dytlOOS(iy\lhC7vnW,f o;vKUNa
Hg#u}W4N wUM
this is the result from mmap()
:
961 printable characters out of 1000 bytes, 96.10%
39 whitespace characters out of 1000 bytes, 3.90%
and this is the result from chunk reading
:
974 printable characters out of 1000 bytes, 97.40%
26 whitespace characters out of 1000 bytes, 2.60%
Why is the number of printable characters
different, but the file is the same in the two methods?
I think isspace()
doesn't accept as a white space in chunked mode and instead counts it as a
printable character
.
You didn't initialize
numberOfCharacters
to0
in the first program.That means that the value of
numberOfCharacters
is undetermined beforeis executed, this is a good reason to separate declaration from definition.