You are on page 1 of 9

A Statistical Learning Model of Text Classification for

Support Vector Machines

Thorsten Joachims
GMD Forschungszentrum IT, AIS.KD
Schloss Birlinghoven, 53754 Sankt Augustin, Germany
Thorsten.Joachims@gmd.de

ABSTRACT       "  


       "
  
                  .     
"  
                        $     
 
                  *  "      3  
      
            !   
 "   $ *     4  "      "
#   
        $            Æ     
  "  "   
 $               
  "           
      $       !  % 2. SUPPORT VECTOR MACHINES
&"   
          ,-5/  
  *"      * 
       ' 
 "( )                       
          ( &  Æ   "   *  $    
        "        *       6    7  * *"  
 
 "(              

 
 6         $    $   - 7-
1. INTRODUCTION 8    " * 
 $      " 
     "  
  "        9        
     *                    Æ$   
    "             8 - 8   *    $    
  $  "      
    +
        
*    : 
   ,--$ ./              "   !
  
   
       
               *  ,-5/

      
     ) 
$   *  
*  0             
  "
     $    "    *    
  6 
- 7
 
 
 -
 "               ; 
 
 
 
 %  ,   7 /  -  
;
   $      *     
 %   < =
          *     
           *    *       ; !      
    "            
*
*     
 #    $    * 

  "        *   Æ
 " *           

   > ?     " $     
      ! -  $    
 *                 *    *       
                    -          '   

      " @        
         1      "   Æ 6 -  
    $     "  "        
 A+-  "$        
    

     
Permission to make digital or hard copies of all or part of this work for
   6

 

-        .
;        

personal or classroom use is granted without fee provided that copies are
not made or distributed for profit or commercial advantage and that copies 
bear this notice and the full citation on the first page. To copy otherwise, to 
republish, to post on servers or to redistribute to lists, requires prior specific 
   6 < 3
permission and/or a fee. 
  ,-/ % <

 4
SIGIR’01, September 9-12, 2001, New Orleans, Louisiana, USA.
Copyright 2001 ACM 1-58113-331-6/01/0009 ...$5.00. 2
AB#F1G9 H#I HA9 )A9 +GA+9GI
        !   
δ
*"       
  "  H 
h* )            H  ) 
δ        *    $
δ     
#J$ :A@AFB19B @1#G1F 9@B 1FK
#J : L  A  D :  * " 
:     @  D : 
   "  
       :     L  * 
 
     
 7     1 D :    A  @   
         £      

            
  Æ          M#:9 1K # B1F A8 &1 8F@D
          
 M  B        B
       ! *" #1 D  
 BA       *"  &  1
B  "      £ 6   £ £ £         
      #1 >A
       
          

      *   $     
      *" &1    BA      
  " *     8          $       Æ        * 
      *         
 
$? M &$  Æ  M L
6


    6   
   C
1   B
 $ G 
 9B 1 8 1@ A H#I @A )A@D KA@D
8G
            <   
  8   *   $       #K H    "   9B 1 8  F
   "        A   "   L I )  F 0  "  
 "                 *"  *      I L C-- 
"   "          $ I )  9  F    
 $   *"       6 <  A+-$ 
    "         
       *     
!
  " *" 
     3  A+; 8       + )  F 
   
       $    
* "  
       !    "  #!$%&
  *  "  8        '  ( )      
 $
  
,-;/     
       
     *             
 *    ,;$ -5/
3. PROPERTIES OF TEXT-CLASSIFICATION TASKS
Heterogeneous Use of Terms. :    .  
   8 ; 1    G ;-3C5   
       *  "           " >    !  ? @
  $
               $    
  *   
  
"  
"   " "               $      "  
D
  *       $            "          
  %    > ?$ > ?$ >?$ > ?$ > ?$ >?$ >?$ > ?$
>  ?$  >? 1             
High-Dimensional Feature Space.       "  "     *    * 
      $       *       !       ) 
$ 


                 "       >   
           $   !  ?$ 0  "  ' 
     *         
    =<$<<<     *  High Level of Redundancy. &     "
" '    
      $ 
Sparse Document Vectors. &        
          
     $       "    "     *  - ,--/        
*           
    G  >    !  ?   " 1
 
"            
   
    *"       

 %EE      E
  9      *   ,-./   

step step step
2 3 1

 * + "   

   *" 9  +GH9  


  $    
  
-;<< 5N4           
;<-3<< C-=                
3<--<<< 4==      <

        
-<<-;<<< 35<
;<<-.<<<
.<<-NN.C
33.
.C3  7  
                
¾
 ƾ  ¼


    ;-5     

7-

  ,        '
) " 
   ¼ 6      -     ¼ 6   7 - 
      ! -    
     ! ;        
H"       "       "       7 -
-;<<$ ;<-3<<$ 3<--<<<$ -<<-;<<<$ ;<<-.<<<$ .<<-NN.C     *    ,-./ @       
    *  -    
              Æ$     " *  
        *          A     + *  - 8   $  ! 
 
  1     "   >  ?     Æ    " * *  *"    
 
 E *
   +GH9  ,--/        *     -   
*         
Frequency Distribution of Words and Zipf’s Law.  
 !            
4.2 Step 2: TCat-Concepts as a Model of Text-Classification Tasks
OL  ,-N/ OL            *" #    "$     *   "    
  !"$      !               " 
    
  "     !"     !             " 
           *     *
*   "      *  
 *  
 
" ! "$        
" !    
    * $ 
! "     "  *       
  "       
      
4. A DISCRIMINATIVE MODEL OF TEXT        
:        "      
CLASSIFICATION &        "   $   
  
                   
        
       8  
      #            $    *      
    8 =$  
       *      '     !"   
              *
   "        8 .  "
                          >  !"  ?
    *          +    "        
              "      
    
  $    
   *               ;<         -<<   
    $                !"    
   " "        @    
4.1 Step 1: Bounding the Expected Error  "   
    !$   
Based on the Margin        ;< $  ;< '
   *  ,-.$ -5/              $      * 
*               - (    -$<<<  !"
"        *  
         8   *  4<<  
   ,-<$ -=/  " !      Æ  $     
   
   
     ;$   9        " *  3  H    
1
0
0
1
0
1
0
1 5"6 ,      *     -<<  
    
" ! "$    -$<<<   
20
0
1 50 words per document
positive
   !"$      -<$<<< 
11100 words in dictionary
documents
!"      *  OL 
 *         $    
5
              *
100 stopwords (irrelevant)

1
0
0
1
4
0
1
         "
200 positive

600 irrelevant

11
00
00
11
1 00
11 10
00
11 11
00
9

   !" # $%$ #


00
11 11
00
00
11
200 negative

300011 1
00
positive
00
11 3000 negative 4000 irrelevant

 #  


00
11
1
00
11
11
00
00
11
1 9 10
00
11
00
11
 , %  %  /  , %  %  / 5
11
00
4
00
11
5

    $   %    


   
  &   '         (
negative
  "            
 "      "      
documents
11
00
00
11
20
00
11            
 
00
11

 .    " /              
    
  ! * *"  :   
    ;<<       "  ) 
$     *         
  
   
  $  
 "       " ,-./ 1 "     
 $        .       8 .$   "
"    
  
    " -    
   * *  
  G 
 "$         .  ,;<%;<%-<</ P  !
    
      ,.%-%;<</ ,-%.%;<</ ,3%3%4<</ P   !
 "      
   ,N%-%=<<</ ,-%N%=<<</ ,-<%-<%.<<</ P  !
, (    "$     -<$<<<   
  !"     $    *    &        $    * 
 .$<<<       -<             :   (
*    
   
   H  
    =$<<<       "  Empirical Validation. :        
  
   
      !"  N    G ;-3C5 $  &*KH $   A 

 -         "     "  *
        *             :   
            =( F         " > ?    &*KH
    8 $           0 
01   2  +    --$-<<       
   $  
   $   

$                #      " ,-./     
       *"     $   N5 !"   
+  1  3 9    " 3<       
         
   $       --$<3<  ;  3; !"        
 
    
          <3 1      
 
  8 3   "$   .=- =.-  
0 , " #         !"          
  
  
.  !"     N !"             3    <;   
           :   !"     3$<.3   
     
     3<   $     "       -<  ;.$;C4  
     

    "     <- 1      
* " 
0  4 "  H     
    "     
 
    
     ;<<    *             $ 
!"         =$<<< !"    "   "     
   8       *  * "      $  >
?   
*  .     !"     N     "  1 
 &*KH   ;CC   
 !"        :    8    
       " ? ?$ 
  "   !"   $    
 ;CCQ   ;CC         N5
   * 3<         !"   
           
      !"     
 %EE  E E ;-3C5 
       "       
 %EE EEE 0 E  ;<EE  
        = 
 %EE   E*E 
 "(   "(   "( 
 7&  .*  $8.$ 
 "       *      <<; <<N << <<  <- 

 *  * *              <- <; <=  <= <=
              <. 
           
            "  *   *   
  *   !-  !;
             "     !=    
                      
  " *    *          
  "   "   
 
       0
    "   
    "  "       
"  "      " "    " 

           "    < <* <*- < < < <  
              
  
 "  <5; .' -<< -<< -<< 
           Æ       -<;<4;< -<; -<=
  "     *   *     
 "               
   *    
 * "   
 

 
       
        
    
 
   
     *   
       * 
              
 
    "            

 "
            "  "    " <<
 $!  *.  !.!%9 
 "(   "(   "( 

 $ 2    "  :


;<   ')   
  "(  

   " -<.Q      


  
 @            >  "?  *
  1       "        " '       
 
     $ "  *  "     :               
   :          
     '  
  ,CC % ;N % N5/ ,. % ;- % 3;/ P  ! *           
   
,-4 % ; % .=-/ ,- % -; % =.-/ P   !       
,N % - % 3<.3/ ,- % ;- % ;.;C4/ P  ! 4.2.1 Step 3: Learnability of TCat-Concepts
,-4N % -N- % 5--4/ P      
       *  : 
      *           
                             : 
 &*KH   " > ?  *      :       " *        #
  $                  ! *" OL $   
          *           9      
 
      "  *   ,-./               
  :       *         *  "    :   
8   G ;-3C5   " >?      &   "  '#%(  $%$ #
   :  
   ,== % ; % 43/ ,=; % 43 % -3;/ P  !   , %  %  /  , %  %  /    
,; % - % -C-/ ,= % ;- % NC./ P   !           
 Æ  
,= % - % =.33/ ,- % -< % -C<;</ P  !
,C5 % 3; % 35;-/ P  6

 ¾



 
 
    
  6 N
  
           A  

Æ   
 7 ; 7 

 " >  "?   ¾
6 

 
  ,; % - % -</ ,- % . % ;;/ P  !  
,; % - % N;/ ,- % ; % N./ P   !
,3%-%.<5</ ,-%-<%;<N;;/ P  !
 B  6         6       $
,-NC % -N< % -=.3N/ P          !          
   
     "     ! $ £     
 
  -N <
   
               %$¼ 
      * 
        6 <  6 ' &  !   &'   - ;<
          *  %
-   *     -N "      
    6 "  -< %
;   -
  ,   /  -  £ 6 - ' &  !   &'   -
$¼ #  ;-
;
      ' &  !   &'     *   
 --
 ,  /  -
    
     
' &  !   &'   6
  !     !   
 ;;
 
 "          
   £ 
  Æ 6 ;  £   H"       
      * $    *   "       6
  !   
 

  !   
 
;=
    *     F     
                
6

-
 
 ;.
   ! *         6 # !#   
  
  #     "       -- 
  *    H"   $        *     ;-       
  *                "    $  
          
    !
     "        :   $
 #  -  $  
 "$  #
-     
 
              
 * 
  "  " *        
@   *  "        

  &        "      
 ¼ #  6 " #  !#
- -; "    $ ,-./         
; *   
  #  - -=   *    9     
  
  *     *   "  
 #
- -. - :  "$  
       (   
F # *     
£   #     $   
¼ £ £  
  9        ( @
  $
 Æ  ; ¼ #£      *  ¼  £    *  
"     
   * 
     *     #    *  !  "     "    *    
           F   $ OL 
      ¼ #£  !  
  $#    £  1    !  
"   
     OL  ,-3/
-
$#     6 #  !#    #  - 7    # 7 - -3
;  ! 6 ;3
 7 )
         <     <   F   "  
  )  3$ *  -=$   
           -=  -. #           *  OL 
     "             
%$#     !"  
"      R   "
%#
6< -4         !     
                    #        !"    *  OL
      $        
# 6 !   ,    / -C     
  OL    
 "$
     "          
8          !
  "   !"$   
     
 "  
9    
# 6 !   &'   -5
&  & "     # ) 
  & 6  $ ' 6 %- -$    6      %  !  !       ( 
   "   *      F  "       *+ 
  
-      ! 6  7 ) ;4
$  6 -   -N
;  ' & ! &'     ! %    ! (
                       "     !  

  6     *0   <     < 
 "   *        $    * 

   
         <     < G
 
      "   *0 
      
 7   
 
%  

6(
)   7 )
 
          $¼  £    

 8       *   !"  
      *   !"       
  !   G  ¾
ƾ


&*KH
 

ƾ
3-N

<



 --.= <  " -4=4 <
 ! 6 ;C  0 C.- <
 7 ) ! -5.5 <
  
    %      
 
   !   !   $  9    
 "

-.5N
353
;C
<
  
A
-355

ƾ
<



 
  
      5-< . 
 
  54N N +  " --4-. <
  6 ;5   ;<5; == : 
 .=5C <
  7 )  .35 < @   ;545 <
 .<3 ; @
 " ==<= <
  =C5 <   ;334 <
: * F -  F ;     -  
      
 ! = >         
 "  # !%?9$& " ?  :
;<
   &   )  $%$ # *&?*$7 " ?   @   *&?9%7
  , %  %  /  , %  %  /    "  "  6 3<  

     (            ? 21    "
 *+   ! 6 
      
 #  @?      "
    & '         :
;< 
   =    " 
           @   
 6
 ¾




 
 
8 $ *  ;      A   $ 
7; 7   6



 

  

  
 ¾ &*KH $    G ;-3C5     
7-     " *    6 <     
 
6 


 

 "       
       
6   
  " $     
     
 


 *  "    G       
     
  *        "     
  %  6  % 6  ( *
          
 
  $ *  "   *       * 


     ! -       ! ;            


     
;   ! G    $  &*KH   $
  
   ! A      
 "$

 #    :     *   Æ   *     >' 
? *   
    * $        
       L  :   ,-5/ :  
   "     $      


    

-   
 
'    $       -  
;N
  " "  
             
7- Æ       *

     :   
         "    "  
        Æ " 
     F     
- 
   *    Æ   *  *  
   
 




 7 ; 7
 

=<
5. COMPARING THE THEORETICAL MODEL WITH EXPERIMENTAL RESULTS



Æ    
     
   *          
       " *    
  9      "  
                 
      !    
 F ; *        



      OL   ! 6 
$    *          8  $ "     
          :   $ 
  "
"         
*      :       
 
 "    
      0  $ *  
Empirical Validation.  :          0          

           
   "  " *    6 <$ 
$               
   *"             
      
   Æ         
     *           .;  :      &*KH   "
   @  (  D
     
    
 
    "    $  *        $
&*KH > ? --;Q ..Q   !"    
    
G  >? -3Q -=Q  
A >  "? N.3Q ;=-Q 1  B "  +   

 * /       
* " ' *     

          '   *  " 
    * 
 A 
        " $   '          
 :
;<   ')?  #   ! 
 '  )?   @   '  , " #        "$ 
 )       6 -<<< =    *            
             *  
 "    
 
  "$     *   "  ' 
> ?      8  $     8B8     ' 
    
OL      &*KH      6 .C<<<<$         ,-./
) 6 3$  * 6 -;3 *0      
 ;$ *       
    * 
               7. LIMITATIONS OF THE MODEL AND
  OPEN QUESTIONS
    

<;==-  -5NNC
..= =- 9
"   *       "      
7- 7-       "        
    *     : ! "$ 8 $      "   
    =N3C               OL $   
  
            --;Q          "     
1         G    " >?        $        
 *   !  
<-5<;  C4;N
-=5   $      *    
    
=;       :    &    
 
7- 7-         *       * 
              N4<=   $   *    *   "  
     -3Q   "$  *      *       $ *  "  
A   " >  "?   *             
C.-;=  -;C35
N.3C ) 
$            * 
    
==  $         *  & 
7- 7-
    "     ,-./      
               N.3Q     *      $  *  
  -<$<<<      *    1    $    
*  =                   *        "    
  *"                       * 
  *
    &     *  8 "$         
      "          "  *         &  
  *
     $ *  =         
 Æ         
             Æ *  "       $  *   " * 
     $     "         "     !    
>?    $ > ?      $        *    
  >  "?     Æ  &  
:     ** "       *  *  8. RELATED WORK
                 
,-=/$  
  
      :       &           *  "  
    "            
       $     " "  
  *  "       *    ,-./    0     
            
 H" @

6. SENSITIVITY ANALYSIS: DIFFICULT H"     " 0         
         ,=/ ) 
$
AND EASY LEARNING TASKS       "   *      
 
     
    *     &               
          *      :  
   
    ,-C/$    *   
           D    '   
        8   
 $      *    "  "    
             Æ
         >"?  > Æ ?    
         $  
   ,-./     
         
    $ *  "            "     *0   
*         *     " 1)-5$  4<4S4;=$ -NN-
 "    *  
       ,4/ @ 8  D K   G 
  
     
    *      E"  :

1     *        G0*$   $ 1  ."  
 ;+     ,-/ ) 
$   H     )    1 " 2     , 
  0  *"   ,5$ N/ B       6#  -#  $  =N-S.<5
,C/ ,4/ ,3/ 
         
  :*  #
 " +$ M " -N5.
    $ * !      ,C/ @ DT
 $  F $  @ 8 1  **  
  "   
                  &*
&  '     
        $       2   #)/33 4  -#
 +                )    #   )    
     ,-4/ "        /   $  .C3S.5;$ K : "$
      *        $ #$ -NNN 1: +$ @ I $ #

    *    *        ,5/  + )  1  **       
         "    ' "    + %    *   
    *   ) 
$ "        "          ,   
                 -      )    $
"  
       *   ;4.%-NCS;<4$ -NC3
,N/  + )  1  **       
9. SUMMARY AND CONCLUSIONS "    + % 1      
  
            **     ,    -
         *  !   
 "      )    $ ;43%;5<S;5N$ -NC3
                  ,-</  M   B )  + **   
                    #   -)     $
   
   #   
    -NNN
 
   $    

     ,--/  M            
 
    
           % F   "  
   
          Æ    2    (  #   
      $ 
        7$  -=C S -.;$ H $ -NN5 
 "     ,-;/  M      
H    
   $         H T  $ : H$  1  $
    
           $ -"  /        
             7$   --  +$ :* $ 1$
     *            -NNN
  
              ,-=/  M  9        
      * 
           Æ "  2   
  *    "  S   
 " S    "$ )    #    7$ 
  
      $   
 8 $ ;<<<   K
    !"   Æ      ,-./  M    - 
        8 "$    
     7  #$      
*   
             -   +B $ #
 T B   $ ;<<-
      K $ 
,-3/ H   *  1          *  
10. REFERENCES   % 1 "   !    *" ) 1
,-/ 1 H    B G   + **        )     #   $ ;-%N<SNN$ 1
       ,    - -N3N
     )    $ ;33%=-;S=-5$ -NC. ,-4/ : ) +   $ + G
$ ) $ 
,;/ : H 1     
        F     % 1  **  
      .    /   "  1:$   $ 25. +34 2  
. "$ ;;%-;-S-4C$ -NN5  -# )0-#)05.)0-1   
,=/ & :           2  .     , 8334  
 **        
   )    9 $  -3NS-45$ @ I $ @I -<<=4$
-# )0)1 #   1  #1$ -NN5 1: +
."   )    1 "$  3CS4-$ ,-C/   :  7 ) "
-NN- )    - +B $    #
 "$
,./  B$ M +  $ B )$    -NN5
  
             ,-5/       7   & "$
        2   -##)/34$ : $ DH$ -NN5
@
* -NN5 ,-N/ D K O ; 6"    2 
,3/ @ 8$  ) $ D F $   $ 7 (<   - )     ; (  
K  $  D K   1E    * 1  & "$ :* $ 1$ #1$ -N.N

You might also like