Classifying a text against a corpus of five themes, where each theme contains 10 articles, in Perl.
For example:
I have:
1. theme "food"
10 articles about food
2. theme "politics"
10 articles about politics
...etc.
I then give the program a new text, and it should classify this text under the appropriate theme
using:
- K-means, or
- a decision tree, or
- segmentation, or
- naive Bayes
Please help me.

Recommended Answers

All 9 Replies

Hi,
What have you tried, and where are you having issues with your script? What desired output are you expecting?

#!/usr/bin/perl

use strict; use warnings;

# Run the pre-processing pipeline on one theme directory of the corpus.
#
# Parameters:
#   $rep - path of the theme directory.  The sub-directories "$rep/art"
#          (input articles), "$rep/clean" and "$rep/tag" (outputs) are used;
#          none of them is created here.
#
# Every plain file found in "$rep/art" is handed to Clean(), and the cleaned
# copy is then handed to Tagger().  Progress is printed on STDOUT.
# Dies when "$rep/art" cannot be opened.
sub Run
{
   my ($rep) = @_;
   my $DIRarticles = "$rep/art";
   my $DIRclean    = "$rep/clean";
   my $DIRtag      = "$rep/tag";
   my $nbfiles     = 0;

   # Lexical directory handle; the message names the path and the OS error
   # so that a wrong working directory is easy to diagnose.
   opendir (my $dh, $DIRarticles)
      or die ("CAUTION : Impossible d'ouvrir le repertoire $DIRarticles : $!");

   # Collect every entry of the directory
   my @articles = readdir ($dh);
   closedir ($dh);

   print "PRE TRAITEMENT sur le REPERTOIRE : $DIRarticles en cours \n";
   foreach my $entree (@articles)
   {
      # Skip ".", ".." and anything else that is not a plain file
      # (a sub-directory is not an article).
      next unless -f "$DIRarticles/$entree";

      print "CLEANING $DIRarticles,$entree ...\n";
      Clean ($DIRarticles, $entree, $DIRclean);
      print "TAGGING $DIRclean,$entree ...\n";
      Tagger ($DIRclean, $entree, $DIRtag);
      $nbfiles++;
   }

   # "\xC3\xA9" is the UTF-8 byte sequence of e-acute; spelling it with
   # escapes keeps the message intact whatever encoding this file is saved in.
   print "\t$nbfiles dans $DIRarticles ont \xC3\xA9t\xC3\xA9 trait\xC3\xA9s \n\n";
}

# Cleaning step: copy one article, normalising punctuation and replacing
# accented letters by two-character ASCII codes.
#
# Parameters:
#   $dir2clean - directory holding the input file
#   $fichier   - file name (used for both input and output)
#   $rep2clean - directory receiving the cleaned copy
#
# Each of the characters . , : ; ' " ? ! / is replaced by " . ".
# Accented letters are matched as UTF-8 byte sequences and replaced by a
# marker plus the base letter: "$" grave, "\" acute (and cedilla),
# "#" circumflex, "~" diaeresis.
#
# Dies when the input cannot be read or the output cannot be written.
sub Clean
{
   my ($dir2clean, $fichier, $rep2clean) = @_;

   # Keys are written as explicit UTF-8 bytes so that the table does not
   # depend on the encoding this source file happens to be saved in.
   # NOTE(review): a-acute and a-circumflex were mapped to upper-case "\A"
   # and "#A", colliding with their capital counterparts; they are
   # lower-case here, like every other lower-case letter of the table.
   my %code_for = (
      "\xC3\x80" => '$A', "\xC3\x81" => '\\A', "\xC3\x82" => '#A', "\xC3\x84" => '~A',
      "\xC3\xA0" => '$a', "\xC3\xA1" => '\\a', "\xC3\xA2" => '#a', "\xC3\xA4" => '~a',
      "\xC3\x92" => '$O', "\xC3\x93" => '\\O', "\xC3\x94" => '#O', "\xC3\x96" => '~O',
      "\xC3\xB2" => '$o', "\xC3\xB3" => '\\o', "\xC3\xB4" => '#o', "\xC3\xB6" => '~o',
      "\xC3\x88" => '$E', "\xC3\x89" => '\\E', "\xC3\x8A" => '#E', "\xC3\x8B" => '~E',
      "\xC3\xA8" => '$e', "\xC3\xA9" => '\\e', "\xC3\xAA" => '#e', "\xC3\xAB" => '~e',
      "\xC3\x8C" => '$I', "\xC3\x8D" => '\\I', "\xC3\x8E" => '#I', "\xC3\x8F" => '~I',
      "\xC3\xAC" => '$i', "\xC3\xAD" => '\\i', "\xC3\xAE" => '#i', "\xC3\xAF" => '~i',
      "\xC3\x99" => '$U', "\xC3\x9A" => '\\U', "\xC3\x9B" => '#U', "\xC3\x9C" => '~U',
      "\xC3\xB9" => '$u', "\xC3\xBA" => '\\u', "\xC3\xBB" => '#u', "\xC3\xBC" => '~u',
      "\xC3\xBF" => '~y',
      "\xC3\x87" => '\\C', "\xC3\xA7" => '\\c',
   );

   # Input file
   open (my $in, '<', "$dir2clean/$fichier")
      or die ("Clean : cannot read $dir2clean/$fichier : $!");
   # Output file
   open (my $out, '>', "$rep2clean/$fichier")
      or die ("Clean : cannot write $rep2clean/$fichier : $!");

   # Walk through the text line by line
   while (my $chaine = <$in>)
   {
       # Punctuation first: the replacement itself contains a ".", so a
       # single pass over the whole class avoids re-processing it.
       $chaine =~ s{[.,:;'"?!/]}{ . }g;

       # Every key of the table is the lead byte 0xC3 plus one continuation
       # byte; sequences that are not in the table are left untouched.
       $chaine =~ s{(\xC3[\x80-\xBF])}{ exists $code_for{$1} ? $code_for{$1} : $1 }ge;

       print {$out} $chaine;
   }

   # Close the files; a failed close on the output means lost data
   close ($in);
   close ($out) or die ("Clean : cannot close $rep2clean/$fichier : $!");
}


# Read a whole file into one string.
#
# Parameters:
#   $dir2open - directory holding the file
#   $fichier  - file name
#
# Returns the full content of "$dir2open/$fichier" ("" for an empty file).
# Dies when the file cannot be opened, instead of silently returning "".
sub makeChaine
{
     my ($dir2open, $fichier) = @_;
     my $chemin = "$dir2open/$fichier";

     # Three-argument open on a lexical handle: the mode cannot be altered
     # by the file name and the handle is private to this call.
     open (my $fh, '<', $chemin)
        or die ("makeChaine : cannot open $chemin : $!");

     # List-context read returns every line; join yields "" when there are none.
     my $chaine = join ('', <$fh>);
     close ($fh);
     return ($chaine);
}

# Send a text to the remote runsygmart.cgi service and return its answer.
#
# Parameters:
#   $chaine - the text to submit (sent as the "texte_entree" form field,
#             with Charset declared as 'utf8')
#
# Returns the body of the HTTP response on success.  On failure the HTTP
# status line is returned instead (and also reported on STDERR), so callers
# that store the result cannot tell the two apart from the value alone.
sub appel_Sygmart
{
     # List assignment: "my $chaine = @_;" would store the NUMBER of
     # arguments (1) and submit that instead of the text.
     my ($chaine) = @_;

     require LWP::UserAgent;
     my $ua = LWP::UserAgent->new;
     $ua->timeout(10);
     $ua->env_proxy;

     # The URL must be a single token: no leading blank, no line break.
     my $response = $ua->post('http://www.lirmm.fr/~chauche/cgi-bin/runsygmart.cgi', {
               Services => 'lslm',
               FormeSortie => 'lslmsc',
               filtre => 'tout3',
               texte_entree => $chaine,
               Charset => 'utf8',
             });

     if ($response->is_success)
     {
         return $response->content;
     }

     warn ("appel_Sygmart : request failed : " . $response->status_line . "\n");
     return $response->status_line;
}



# Tagging step: submit one cleaned article to appel_Sygmart() and store the
# answer under the same file name in the tag directory.
#
# Parameters:
#   $dir2open - directory holding the cleaned file
#   $fichier  - file name (used for both input and output)
#   $rep2tag  - directory receiving the tagged result
#
# Dies when the output file cannot be written.
sub Tagger
{
     my ($dir2open, $fichier, $rep2tag) = @_;

     # Read and tag BEFORE creating the output, so that a failure while
     # reading does not leave an empty result file behind.
     my $texte    = makeChaine ($dir2open, $fichier);
     my $textetag = appel_Sygmart ($texte);

     # Output file
     my $cible = "$rep2tag/$fichier";
     open (my $out, '>', $cible)
        or die ("Tagger : cannot write $cible : $!");
     print {$out} $textetag;
     close ($out) or die ("Tagger : cannot close $cible : $!");
}


# Pre-process every theme directory of the corpus, in this order.
foreach my $theme_dir (
   "../article/Donquichote",
   "../article/ParisElection",
   "../article/SarkozyCarla",
   "../article/SkiGrange",
   "../article/Tf1DaylimotionYoutube",
)
{
   Run ($theme_dir);
}

Hello 2teez,
That is my script.
When I execute it,
it shows an error in the function Run and I don't know why!
Can you help me, please?

Can you help me please

Yes, I really want to help, but unless you could show the sample of your input, and how your desired output should look like, it would be difficult to tell.

Moreover, I love the French language, but I am a beginner in it and it has been a long time since I went over my French class-work. So, could you please show your code in English so that one can follow it, especially your comments?

Will be waiting to hear from you soon.

#!/usr/bin/perl
use strict; use warnings;

# Launch the pre-processing pipeline on one theme directory of the corpus.
#
# Parameters:
#   $rep - path of the theme directory.  The sub-directories "$rep/art"
#          (input articles), "$rep/clean" and "$rep/tag" (outputs) are used;
#          none of them is created here.
#
# Every plain file found in "$rep/art" is handed to Clean(), and the cleaned
# copy is then handed to Tagger().  Progress is printed on STDOUT.
# Dies when "$rep/art" cannot be opened.
sub Run
{
   my ($rep) = @_;     # theme directory
   my $DIRarticles = "$rep/art";
   my $DIRclean    = "$rep/clean";
   my $DIRtag      = "$rep/tag";
   my $nbfiles     = 0;

   # Lexical directory handle; the message names the path and the OS error
   # so that a wrong working directory is easy to diagnose.
   opendir (my $dh, $DIRarticles)
      or die ("CAUTION : Impossible to open the directory $DIRarticles : $!");

   # Store every entry of the directory in an array
   my @articles = readdir ($dh);
   closedir ($dh);

   print "PRE TRAITEMENT  : $DIRarticles en cours \n";   # pre-processing of the directory
   foreach my $entree (@articles)
   {
      # Skip ".", ".." and anything else that is not a plain file
      # (a sub-directory is not an article).
      next unless -f "$DIRarticles/$entree";

      print "CLEANING $DIRarticles,$entree ...\n";
      Clean ($DIRarticles, $entree, $DIRclean);
      print "TAGGING $DIRclean,$entree ...\n";
      Tagger ($DIRclean, $entree, $DIRtag);
      $nbfiles++;
   }

   # "\xC3\xA9" is the UTF-8 byte sequence of e-acute; spelling it with
   # escapes keeps the message intact whatever encoding this file is saved in.
   print "\t$nbfiles dans $DIRarticles ont \xC3\xA9t\xC3\xA9 trait\xC3\xA9s \n\n";
}

# Cleaning operation: copy one article, normalising punctuation and
# replacing accented letters by two-character ASCII codes.
#
# Parameters:
#   $dir2clean - directory holding the input file
#   $fichier   - file name (used for both input and output)
#   $rep2clean - directory receiving the cleaned copy
#
# Each of the characters . , : ; ' " ? ! / is replaced by " . ".
# Accented letters are matched as UTF-8 byte sequences and replaced by a
# marker plus the base letter: "$" grave, "\" acute (and cedilla),
# "#" circumflex, "~" diaeresis.
#
# Dies when the input cannot be read or the output cannot be written.
sub Clean
{
   my ($dir2clean, $fichier, $rep2clean) = @_;

   # Keys are written as explicit UTF-8 bytes so that the table does not
   # depend on the encoding this source file happens to be saved in.
   # NOTE(review): a-acute and a-circumflex were mapped to upper-case "\A"
   # and "#A", colliding with their capital counterparts; they are
   # lower-case here, like every other lower-case letter of the table.
   my %code_for = (
      "\xC3\x80" => '$A', "\xC3\x81" => '\\A', "\xC3\x82" => '#A', "\xC3\x84" => '~A',
      "\xC3\xA0" => '$a', "\xC3\xA1" => '\\a', "\xC3\xA2" => '#a', "\xC3\xA4" => '~a',
      "\xC3\x92" => '$O', "\xC3\x93" => '\\O', "\xC3\x94" => '#O', "\xC3\x96" => '~O',
      "\xC3\xB2" => '$o', "\xC3\xB3" => '\\o', "\xC3\xB4" => '#o', "\xC3\xB6" => '~o',
      "\xC3\x88" => '$E', "\xC3\x89" => '\\E', "\xC3\x8A" => '#E', "\xC3\x8B" => '~E',
      "\xC3\xA8" => '$e', "\xC3\xA9" => '\\e', "\xC3\xAA" => '#e', "\xC3\xAB" => '~e',
      "\xC3\x8C" => '$I', "\xC3\x8D" => '\\I', "\xC3\x8E" => '#I', "\xC3\x8F" => '~I',
      "\xC3\xAC" => '$i', "\xC3\xAD" => '\\i', "\xC3\xAE" => '#i', "\xC3\xAF" => '~i',
      "\xC3\x99" => '$U', "\xC3\x9A" => '\\U', "\xC3\x9B" => '#U', "\xC3\x9C" => '~U',
      "\xC3\xB9" => '$u', "\xC3\xBA" => '\\u', "\xC3\xBB" => '#u', "\xC3\xBC" => '~u',
      "\xC3\xBF" => '~y',
      "\xC3\x87" => '\\C', "\xC3\xA7" => '\\c',
   );

   # Open the input file
   open (my $in, '<', "$dir2clean/$fichier")
      or die ("Clean : cannot read $dir2clean/$fichier : $!");
   # Open the output file
   open (my $out, '>', "$rep2clean/$fichier")
      or die ("Clean : cannot write $rep2clean/$fichier : $!");

   # Walk through the text line by line
   while (my $chaine = <$in>)
   {
       # Punctuation first: the replacement itself contains a ".", so a
       # single pass over the whole class avoids re-processing it.
       $chaine =~ s{[.,:;'"?!/]}{ . }g;

       # Every key of the table is the lead byte 0xC3 plus one continuation
       # byte; sequences that are not in the table are left untouched.
       $chaine =~ s{(\xC3[\x80-\xBF])}{ exists $code_for{$1} ? $code_for{$1} : $1 }ge;

       print {$out} $chaine;
   }

   # Close the files; a failed close on the output means lost data
   close ($in);
   close ($out) or die ("Clean : cannot close $rep2clean/$fichier : $!");
}


# Read a whole file into one string ("chaine" = string, "ligne" = line).
#
# Parameters:
#   $dir2open - directory holding the file
#   $fichier  - file name
#
# Returns the full content of "$dir2open/$fichier" ("" for an empty file).
# Dies when the file cannot be opened, instead of silently returning "".
sub makeChaine
{
     my ($dir2open, $fichier) = @_;
     my $chemin = "$dir2open/$fichier";

     # Three-argument open on a lexical handle: the mode cannot be altered
     # by the file name and the handle is private to this call.
     open (my $fh, '<', $chemin)
        or die ("makeChaine : cannot open $chemin : $!");

     # List-context read returns every line; join yields "" when there are none.
     my $chaine = join ('', <$fh>);
     close ($fh);
     return ($chaine);
}

#We can delete this function it s operationnal
# Send a text to the remote runsygmart.cgi service and return its answer.
#
# Parameters:
#   $chaine - the text to submit (sent as the "texte_entree" form field,
#             with Charset declared as 'utf8')
#
# Returns the body of the HTTP response on success.  On failure the HTTP
# status line is returned instead (and also reported on STDERR), so callers
# that store the result cannot tell the two apart from the value alone.
sub appel_Sygmart
{
     # List assignment: "my $chaine = @_;" would store the NUMBER of
     # arguments (1) and submit that instead of the text.
     my ($chaine) = @_;

     require LWP::UserAgent;
     my $ua = LWP::UserAgent->new;
     $ua->timeout(10);
     $ua->env_proxy;

     # The URL must be a single token: no leading blank, no line break.
     my $response = $ua->post('http://www.lirmm.fr/~chauche/cgi-bin/runsygmart.cgi', {
               Services => 'lslm',
               FormeSortie => 'lslmsc',
               filtre => 'tout3',
               texte_entree => $chaine,
               Charset => 'utf8',
             });

     if ($response->is_success)
     {
         return $response->content;
     }

     warn ("appel_Sygmart : request failed : " . $response->status_line . "\n");
     return $response->status_line;
}



# Tagging step: submit one cleaned article to appel_Sygmart() and store the
# answer under the same file name in the tag directory.
#
# Parameters:
#   $dir2open - directory holding the cleaned file
#   $fichier  - file name (used for both input and output)
#   $rep2tag  - directory receiving the tagged result
#
# Dies when the output file cannot be written.
sub Tagger
{
     my ($dir2open, $fichier, $rep2tag) = @_;

     # Read and tag BEFORE creating the output, so that a failure while
     # reading does not leave an empty result file behind.
     my $texte    = makeChaine ($dir2open, $fichier);
     my $textetag = appel_Sygmart ($texte);

     # Output file
     my $cible = "$rep2tag/$fichier";
     open (my $out, '>', $cible)
        or die ("Tagger : cannot write $cible : $!");
     print {$out} $textetag;
     close ($out) or die ("Tagger : cannot close $cible : $!");
}


# Pre-process every theme directory of the corpus, in this order.
my @theme_dirs = (
   "../article/Donquichote",
   "../article/ParisElection",
   "../article/SarkozyCarla",
   "../article/SkiGrange",
   "../article/Tf1DaylimotionYoutube",
);
Run ($_) for @theme_dirs;

Hi 2teez,
I totally forgot that this website is in English, so I tried to translate my script into English. I hope that it will help you.

Hi,
I would have loved to see what your raw data looks like, because I don't think you need to use s/// this much to parse your data.

Secondly, you could use the File::Find module instead of trying to hand-pick your files from the directory.

It is a lot better to use the three-argument form of open and a lexical filehandle than what you are presently doing, something like:
open my $fh, '<', $filename or die "can't open file: $!"

Hello,
First of all, thank you for taking the time to answer me.

Here is exactly what I have to do:

Establish the corpus.
Prepare our project structure.
Write a Perl script that:
      1 Browses the corpus.

      2 Cleans the files and makes the substitutions required by SYGMART.

      3 Calls SYGMART and saves the result.
The purpose of this project is to implement and evaluate a document classification method programmed in Perl.
**First step: formation of the corpus**
In a first step, a corpus must be built. We propose to build a corpus covering five distinct themes (for example: politics, cooking, etc.). This corpus will be normalized (removal of HTML tags, etc.). To do this, you will find ten texts written in French or English relating to each of these five themes.
**Second step: implementation of a classification algorithm**
The next task is to implement a classification algorithm. Many
learning approaches can be used for text classification:
• K nearest neighbors
• Decision trees
• Naïve Bayes
• Neural networks
• Support vector machines
In this project, we propose to use the well-known K nearest neighbors (KNN) method covered
in class.
**Third step: taking linguistic information into account**
The goal here is to use your texts with different levels of linguistic information:
• Raw texts.
• Lemmatised texts.
• Lemmatised texts with syntactic parsing.

**The project structure** as I see it is this:

ROOT
|____REP Article
     |____REP Donquichote
          |
          |
          |____REP Art
               |
               |
               |
               |____Txt files
          |
          |
          |
          |
          |
          |____REP clean
               |____Txt files cleaned
          |
          |
          |
          |____REP tag
               |____Tagged files in .txt format
          |
          |
          |
          |
          |
          |____REP vect
               |____Txt files
     |
     |____REP ParisElection
          |
          |
          |____REP Art
               |____Txt files
          |
          |
          |____REP clean
               |____Txt files cleaned
          |
          |
          |____REP tag
               |____Tagged files in .txt format
          |
          |
          |____REP vect
               |____Txt files
     |
     |____REP SarkozyCarla
          |
          |
          |____REP Art
               |____Txt files
          |
          |
          |____REP clean
               |____txt files cleaned
          |
          |
          |____REP tag
               |____Tagged files in .txt format
          |
          |
          |____REP vect
               |____Txt files
    |
    |____REP SkiGrange
          |
          |
          |____REP Art
               |____Txt files
          |
          |
          |____REP clean
               |____Txt files cleaned
          |
          |
          |____REP tag
               |____Tagged files in .txt format
          | 
          |
          |____REP vect
               |____Txt files
          |
          |____REP Tf1DaylimotionYoutube
               |
               |____REP Art
                    |____Txt files
               |
               |
               |____REP clean
                   |____Txt files cleaned
               |
               |
               |____REP tag
                    |____Tagged files in .txt format
               |
               |
               |____REP vect
                    |____Txt files
|
|____REP Binary
     |____Executions files
|
|____REP Data
     |____...

hello do you have the solution please??

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.