I have programmed an Aho-Corasick algorithm with a transition table that searches for a set of words in a text and displays the number of occurrences by using malloc(), but I am encountering this error:
Segmentation fault (core dumped)
Here is the algorithm that I have programmed
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define ALPHABET_SIZE 70
#define MAX_WORD_LENGTH 100000
typedef struct Element Element;
struct Element {
int nombre;
Element *suivant;
};
typedef struct File File;
struct File {
Element *premier;
Element *dernier;
};
File *creerfile() {
File *file = (File *)malloc(sizeof(File));
if (file == NULL) {
perror("Erreur d'allocation pour la file");
exit(EXIT_FAILURE);
}
file->premier = NULL;
file->dernier = NULL;
return file;
}
int estVide(File *file) {
return file->premier == NULL;
}
void enfiler(File *file, int element) {
Element *nouveau = malloc(sizeof(*nouveau));
if (nouveau == NULL) {
perror("Erreur d'allocation pour un élément de la file");
exit(EXIT_FAILURE);
}
nouveau->nombre = element;
nouveau->suivant = NULL;
if (file->premier != NULL) {
file->dernier->suivant = nouveau;
} else {
file->premier = nouveau;
}
file->dernier = nouveau;
}
int defiler(File *file) {
if (file == NULL || file->premier == NULL) {
perror("Erreur de défiler : file vide");
exit(EXIT_FAILURE);
}
int nombreDefile = file->premier->nombre;
Element *elementDefile = file->premier;
file->premier = elementDefile->suivant;
free(elementDefile);
if (file->premier == NULL) {
file->dernier = NULL;
}
return nombreDefile;
}
struct _trie {
int maxNode;
int nextNode;
int **transition;
char *finale;
int *supp;
};
typedef struct _trie Trie;
void initialisationTrie(Trie *trie, int maxNode) {
trie->maxNode = maxNode;
trie->nextNode = 1;
trie->transition = (int **)malloc(maxNode * sizeof(int *));
if (trie->transition == NULL) {
perror("Erreur d'allocation pour le tableau de transitions");
exit(EXIT_FAILURE);
}
for (int i = 0; i < maxNode; ++i) {
trie->transition[i] = (int *)malloc(ALPHABET_SIZE * sizeof(int));
if (trie->transition[i] == NULL) {
perror("Erreur d'allocation pour une ligne du tableau de transitions");
exit(EXIT_FAILURE);
}
for (int j = 0; j < ALPHABET_SIZE; ++j) {
trie->transition[i][j] = 0;
}
}
trie->finale = (char *)malloc(maxNode * sizeof(char));
if (trie->finale == NULL) {
perror("Erreur d'allocation pour le tableau finale");
exit(EXIT_FAILURE);
}
for (int i = 0; i < maxNode; ++i) {
trie->finale[i] = 0;
}
trie->supp = (int *)malloc(maxNode * sizeof(int));
if (trie->supp == NULL) {
perror("Erreur d'allocation pour le tableau supp");
exit(EXIT_FAILURE);
}
for (int i = 0; i < maxNode; ++i) {
trie->supp[i] = 0;
}
}
void ajoutmot(Trie *trie, char mot[MAX_WORD_LENGTH]) {
int courant = 0;
for (int i = 0; i < strlen(mot); ++i) {
char c = mot[i];
int index;
if (c >= 'a' && c <= 'z') {
index = c - 'a';
} else {
// Caractère spécial, traitez-le comme une lettre
index = c - 'A' + 26; // Ajoutez 26 pour l'ajustement
}
if (trie->transition[courant][index] == 0) {
trie->transition[courant][index] = trie->nextNode;
trie->nextNode++;
}
courant = trie->transition[courant][index];
}
trie->finale[courant] = 1;
}
void constructionsuppleants(Trie *trie) {
File *file = creerfile();
for (int i = 0; i < ALPHABET_SIZE; ++i) {
if (trie->transition[0][i] != 0) {
enfiler(file, trie->transition[0][i]);
trie->supp[trie->transition[0][i]] = 0;
}
}
while (!estVide(file)) {
int r = defiler(file);
for (int i = 0; i < ALPHABET_SIZE; ++i) {
int s = trie->transition[r][i];
if (s != 0) {
enfiler(file, s);
int etat = trie->supp[r];
while (trie->transition[etat][i] == 0 && etat != 0) {
etat = trie->supp[etat];
}
trie->supp[s] = trie->transition[etat][i] != 0 ? trie->transition[etat][i] : 0;
}
}
}
free(file);
}
int AhoCorrasick(Trie *trie, char *text) {
int cmpt = 0;
int etat = 0;
for (int i = 0; text[i] != '\0'; ++i) {
char c = text[i];
int index;
if (c >= 'a' && c <= 'z') {
index = c - 'a';
} else {
// Traitez le caractère spécial comme une lettre
index = c - 'a' + 26;
}
while (trie->transition[etat][index] == 0 && etat != 0) {
etat = trie->supp[etat];
}
etat = trie->transition[etat][index] != 0 ? trie->transition[etat][index] : 0;
int etatTemporaire = etat;
while (etatTemporaire != 0) {
if (trie->finale[etatTemporaire] == 1) {
cmpt++;
}
etatTemporaire = trie->supp[etatTemporaire];
}
}
return cmpt;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "aho_corrasick_matrice.c"
#include "aho_corrasick_matrice.h"
#define ALPHABET_SIZE 70
#define BUFFER_SIZE 4096
#define MAX_WORD_LENGTH 100000
int main(int argc, char *argv[]) {
if (argc != 3) {
fprintf(stderr, "Usage: %s mots.txt texte.txt\n", argv[0]);
exit(EXIT_FAILURE);
}
Trie trie;
long int maxNode = 500000;
initialisationTrie(&trie, maxNode);
FILE *motsFile = fopen(argv[1], "r");
if (motsFile == NULL) {
fprintf(stderr, "Erreur lors de l'ouverture du fichier %s\n", argv[1]);
exit(EXIT_FAILURE);
}
char mot[MAX_WORD_LENGTH];
while (fscanf(motsFile, "%s", mot) == 1) {
ajoutmot(&trie, mot);
}
fclose(motsFile);
constructionsuppleants(&trie);
FILE *texteFile = fopen(argv[2], "r");
if (texteFile == NULL) {
fprintf(stderr, "Erreur lors de l'ouverture du fichier %s\n", argv[2]);
exit(EXIT_FAILURE);
}
char buffer[BUFFER_SIZE];
int occurrences = 0;
while (fgets(buffer, sizeof(buffer), texteFile) != NULL) {
occurrences += AhoCorrasick(&trie, buffer);
}
printf("Nombre d'occurrences est : %d\n", occurrences);
fclose(texteFile);
free(trie.finale);
for (int i = 0; i < maxNode; ++i) {
free(trie.transition[i]);
}
free(trie.transition);
free(trie.supp);
return 0;
}
For searching sets in a text with an alphabet size of 20, it works well, but with an alphabet size of 70, it doesn't work, and I get this segmentation fault error.
There are at least these problems:
you convert the characters into index values that can be larger than
ALPHABET_SIZE, invoking undefined behavior when you index into the transitions tables.the initial value
maxNodeof500000might be too small if the dictionary is large. You do not check iftrie->nextNodestays inside the array boundaries inajoutmot(ie:trie->nextNode < trie->maxNode), causing undefined behavior when writing totrie->transition[courant][index]ortrie->finale[courant]withcourant >= trie->maxNode.Here is a modified version of
ajoutmotandAhoCorrasickusing a normalized index value:This might not fix all the problems, but the segmentation faults should be prevented. Note that a segmentation fault is an easy bug to fix as the debugger will point to the offending code immediately.