2 different output from an input file iterated from mmap and read with a chunk in C

163 Views Asked by At

I want to read a file which has some characters in it and check the percentage of printable characters as well as the percentage of white spaces. This is my Python code which generates the input file:

import string
import random
array = list()
array = list(string.printable)
print(array)
external = ['\0','\a','\b','\v','\f','\e']
array = array + external
file = open("in.txt" , 'w')
for i in range (1000):
        outputline = array[random.randrange(0,len(array)-1)]
        file.write(outputline)
file.close()

I want my file to have both printable characters and whitespaces and other characters (which are not in these two groups). I do this in two ways:

  1. Read the file, chunk by chunk, with the read system call in C:

    #include <stdio.h>
    #include <stdio.h>
    #include <ctype.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <errno.h>
    int main(int arg, char *argv[])
    {
        char c ;
        char *data;
        int numOfWs = 0 ;
        int numOfPr = 0 ;
        int numberOfCharacters;
        int sizeOfBlock ;
        int nread ;
        int i=0;
        int k;
        float wsP = 0;
        float prP = 0;
        sizeOfBlock = atoi(argv[3]) ;
        data = malloc(sizeOfBlock*sizeof(char));
        int fd = open(argv[2], O_RDONLY);
        while((nread = read(fd, data, sizeOfBlock)) > 0)
        {
            numberOfCharacters += nread ;
            for (i = 0; i < nread; ++i)
            {
                c = data[i] ;
                if(isprint(c))
                    numOfPr ++ ;
                else if(isspace (c))
                    numOfWs ++ ;
            }
        }
        wsP = (numOfWs / (float)numberOfCharacters)*100;
        prP = (numOfPr / (float)numberOfCharacters)*100 ;
        printf("%d printable characters out of %d bytes, %.2f%c\n", numOfPr,numberOfCharacters,prP,'%');
        printf("%d whitespace characters out of %d bytes, %.2f%c\n", numOfWs,numberOfCharacters,wsP, '%');
        exit(0);
    }
    
  2. Copy the whole file into memory using mmap() and then start to read it from memory:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <errno.h>
    #include <ctype.h>
    #include <string.h>
    int main(int arg, char *argv[])
    {
        char c ;
        int i, numOfWs = 0, numOfPr = 0, numberOfCharacters = 0;
        char *data;
        float wsP = 0;
        float prP = 0;
        struct stat s;
        int fp = open("x.txt", O_RDWR);
        int status = fstat (fp, &s);
        int size = s.st_size;
        data = mmap((caddr_t)0, size, PROT_READ, MAP_SHARED, fp,0);
        for (i=0; i<size; i++){
            char c;
            c = data[i];
            if(isspace(c))
                numOfWs ++;
            else if(isprint(c))
                numOfPr ++;
            numberOfCharacters ++ ;
        }
        wsP = (numOfWs / (float)numberOfCharacters)*100;
        prP = (numOfPr / (float)numberOfCharacters)*100 ;
        printf("%d printable characters out of %d bytes, %.2f%c\n", numOfPr,numberOfCharacters,prP,'%');
        printf("%d whitespace characters out of %d bytes, %.2f%c\n", numOfWs,numberOfCharacters,wsP, '%');
        close(fp);
        exit(0);
    }
    

when I check these two ways with the same file (created by the Python code) I get two different answers for the percentages but the number of characters which are read each way are the same (1000).

This is my file generated from the Python code (I don't know what happens when I copied it here it contains lots of control characters):

el*2mlz_XyjP@%?Sw}Qo~.."tJ^~6,eN8+kq l)*N-1oupE
)coFKoA0\=|W'{Oezx~^p(B5ZJe!AdYb{Gflv&wwCf8}>3"v*>9\pW8PIs;qpX7RSk<9}&8B$u kNaq(mJK$N-!38?E%8-T,I1zC~0O=}FH*
x9x6Q%GT_C0j>7:@EG{N ?Eh$?18;Ncy[3 $'ikKs%:A].?e;i4`x"k!VD]}*pw
?\wE~Vix7^H~[26lsN?_GO$vz3M464S`+h=A(5@]q<&<+ hjmehAb-_3*3F8&#iM3p)6T`S9Q\yZwm$U`OHG}02{A)WcVzBR1h}H?qhF:P^-j5AQ1<7FD60j#B#}9Z=}2QReaYy|{Wv<^!yOC/7P}n*ZEPV2@8cU),=*5]]d a:3J;Y(?D?31$pcrquc#&PB;A[9lV+gJ%WZ6K~A|%^E_\3dM/?"y)BtUtG"3hf}W4,3DrXxTyl\UbWwCbMufqCNWx |hiJ\>43S6tCCS)rEo0.cz5PjgK0_AKN|8'g]byLp9AlrZDuK1OX,Csa}nu&i_p,#
Wyc{Q
LA\4:!WSq"ln|Pv.B;+N'h%O;tu(CgIh~OYIXCl+6~nSxBuybP nH:j;t'\vk&p}
,;3Ny#`Ug!rVbqExY|  %BVCD^D~z:L(j8L!    @   X4a!KBNCQ4z&3^9[O<fkM-qrOq5F/M*]yyU+-VLdZRtUu
a"=a b%c~GI|tcC/
P'/`t|hZ/2iHd94l"%;4-{)VUw%%3"e%IQ{RAX]NeMcsh&@LziT0)_T"2XADH&NYqa<6,$wdSp@LIMGA&,Gx1mj|t't?7=YtT77r<qi8;|tzi kOAi'dq%+g2   5hY?XTj{F)18.Vd!!$Q{D)}$7XxO)Vi%29*,P"cXkC,M|&brd&-DGF>V4 %N)a"VM+TQ$FI;YiL-0 YSxXgC@i~,o6/a7U2c"eGr\N7^B:'dytlOOS(iy\lhC7vnW,f o;vKUNa
Hg#u}W4N wUM

this is the result from mmap():

961 printable characters out of 1000 bytes, 96.10%
39 whitespace characters out of 1000 bytes, 3.90%

and this is the result from chunk reading:

974 printable characters out of 1000 bytes, 97.40%
26 whitespace characters out of 1000 bytes, 2.60%

Why is the number of printable characters different, but the file is the same in the two methods?

I think isspace() doesn't accept as a white space in chunked mode and instead counts it as a printable character.

2

There are 2 best solutions below

3
On

You didn't initialize numberOfCharacters to 0 in the first program.

That means that the value of numberOfCharacters is undetermined before

numberOfCharacters += nread ;

is executed, this is a good reason to separate declaration from definition.

1
On

the problem was that spaces are printable in the isprint()i change the order of isspace() and isprint() and use if()...else if() it becomes true.