Zornitsa Kozareva USC/ISI Marina del Rey,...

Post on 21-Mar-2019

216 views 0 download

Transcript of Zornitsa Kozareva USC/ISI Marina del Rey,...

CS544: Named Entity Discrimination

Zornitsa Kozareva, USC/ISI

Marina del Rey, CA, kozareva@isi.edu

www.isi.edu/~kozareva

January 31, 2013

Search Engine

Who is Jerry Hobbs - Jerry R. Hobbs. Address: USC/ISI 4676 Admiralty Way ... Jerry R. Hobbs hobbs@isi.edu

AMW.com Jerry Hobbs - Fugitive America's Most Wanted is a long-running American TV

show produced by 20th Century Fox, and is the longest-running program of any kind in

:&"'&+/:A$;'AP':&"'DDD'-99"*".'R+99"$'S#.'S")+".'TA).O'!"$E+%"/'!":' PA$'GIA'U+V9"'W+$9.,'-' ' X?.*"'&#.'.")+".'

KA).' PA$' !"##$) %&''(,' DDD' !"##$) %&''(O' I&A' +/' #%%?/".' AP' Q+99+)*' &+/' YB;"#$BA9.+)' :&+/'?).#:".'DDD'

7*550&@6AA3B'<+Q+J".+#O' :&"'P$""'")%;%9AJ".+#&S$,'!"##$'=,'%&''(' ZKA$)'L['\#)?#$;']Y5L^'+/'#'J$A8+)"):'$"/"#$%&"$'+)':&"'_"9./'AP'9''<A$9.'@#$D#9'-$:/'W#8"/'=#)Q0'`$.'S#)O'S+/%+J9+)"0'\?B\?:/?'4//?".'T;0'!"##$)%&''('

]M:&'S#)'=#)Q0']/:'

&&&E2/?&'()*+&,-./0&C*46F-2.6-& GCH&I,C&

a'

''''a'

a'

L'

Ideally, we want

Jerry Hobbs, a fifth generation farmer,

Wayne Cryts finished harvesting his crop

Jerry Hobbs is the rage-filled, domestic-

abusing career criminal who killed his

Jerry Hobbs, who is accused of killing his 8-year-old daughter and her best

Dr. Jerry R. Hobbs is a prominent researcher

in the fields of computational linguistic

Jerry R. Hobbs. Address: USC/ISI 4676 Admiralty Way ...

`'

NE Recognition vs. NE Discrimination

•  NE Recognition = detection & classification of entity mentions into a predefined set of categories.

→ achieves only a partial disambiguation of names

•  NE Discrimination = finding the actual entity denoted

by a particular name occurrence in text.

I,C&

a'

''''a'

a'

Why is it called “Discrimination”?

Discrimination

- the total number of senses is unknown

- the meaning of each sense is unknown

- no specific mapping of cluster → sense

Disambiguation

- the total number of senses is known

- the meaning of each sense is known

- the order is based on the frequency

meaning 1:

the slope beside a body of water

meaning 2:

depository financial institution

R&

bank   Jerry Hobbs

group 1

group 2

group 3

5'

Importance of Name Ambiguity on the Web

•  Queries about NEs constitute a significant portion of Web queries:

–  11-17% contain person name*

–  4% are about a person name*

•  Ideally, search results should be clustered such that each cluster corresponds to the same individual

–  faster fact extraction

–  more accurate information retrieval

5   * study by Javier Artiles, 2009

On the Web …

•  Nobody knows how many senses (meanings) are

there for a given person name

•  It is impossible to estimate and trace the most

frequent sense

–  the task is time consuming and tedious for humans

– new Web pages constantly appear

– old Web pages might be deleted over time

6'

Name Ambiguity in Wikipedia

He is seen as a national hero by those who live in Georgia.

c'

http://search.intelius.com/

U.S. Census Bureau states 90,000 names are shared by 100,000,000 people

Name Ambiguity in Data Bases

Y'

http://search.carrot2.org/stable/search

Carrot Search Results Clustering Engine

Today

]M'

http://search.carrot-search.com/carrot2-webapp/search

Carrot Search

Today

How can we solve this problem?

Types of Machine Learning

•  Unsupervised Learning –  correct responses (targets) are not provided –  the algorithm identifies similarities between the

inputs based on something in common

•  Method:

–  Clustering

•  NLP Tasks: – Named Entity Disambiguation, Text Categorization

]L'

Clustering

•  Are there any “groups” in the data?  •  What is each group?

•  How many groups are there?

•  How did you identify them?

]`'

Clustering

14

14

Clustering by

color   size

What is Clustering?

•  Clustering is the process of grouping a set of objects into classes of similar objects, without

the help of training examples

– classification vs. clustering

]['

Applications

•  Clustering is a common and important task that

finds many applications in Science, Engineering

among others

– group genes that perform the same function

– group individuals that have similar political view

–  identify similar objects from pictures

– categorize documents of similar topics

– disambiguate named entities (our example)

]6'

Clustering Process

•  Define a feature vector to represent the data –  Often called a vector-space model

•  Select Features

]7'

Set of Objects

Feature Representation

Similarity Measure

Element Grouping

Output Interpretation

Feature Representation

18

• Bag-of-words: each term in a document is a feature of

that document

Feature Representation

19

• TF: term frequency

•  definition: $TF = t_{ij}$ –  frequency of term i in document j

•  purpose: makes the frequent words for the document

more important

Example

20

Chapter1      #terms

> ' ' ' '56'

>> ' ' ' ']6'

a   100

aardvark   1

ability   3

able   23

…

coefficient   1

clusters   10

with   19

words   34

the   200

zeros   1

Example

21

Chapter1      #terms

> ' ' ' '56'

>> ' ' ' ']6'

a   100

aardvark   1

ability   3

able   23

…

coefficient   1

clusters   10

with   19

words   34

the   200

zeros   1

Many low frequency words!

Can we adjust tf?

Feature Representation

22

• IDF: inverted document frequency

• definition: $IDF = \log(N / n_i)$ – $n_i$ : number of documents containing term i – $N$ : total number of documents

•  purpose: makes rare words across documents more

important

• TF: term frequency

•  definition: $TF = t_{ij}$ –  frequency of term i in document j

•  purpose: makes the frequent words for the document

more important

• IDF: inverted document frequency

• definition: $IDF = \log(N / n_i)$ – $n_i$ : number of documents containing term i – $N$ : total number of documents

•  purpose: makes rare words across documents more

important

• TF.IDF (for term i in document j)

•  definition: $t_{ij} \times \log(N / n_i)$

TF.IDF Term Weighting

Other Features

24

• Dependency Tree

• find has two features incremented by +1 –  subj: John – obj: solution

•  John has one feature –  subj-of:find

found

John solution a

to problem

the

subj obj

mod det

pcomp

det

Clustering Process

•  Quantifies the closeness between the feature vectors of two elements

L['

Set of Objects

Feature Representation

Similarity Measure

Element Grouping

Output Interpretation

What is Similarity?

L6'

•  Hard to define, but we know it when we see it. •  Easier to think in terms of the distance between vectors

Properties of distance measure

•  $D(A,B) = D(B,A)$   Symmetry

•  $D(A,A) = 0$   Constancy of Self-Similarity

•  $D(A,B) = 0$ iff $A = B$   Positivity (Separation)

•  $D(A,B) \le D(A,C) + D(B,C)$   Triangular Inequality

L7'

Properties of distance measure

•  $D(A,B) = D(B,A)$   Symmetry   Otherwise you could claim that “Alex looks like Bob, but

Bob looks nothing like Alex”

•  $D(A,A) = 0$   Constancy of Self-Similarity

•  $D(A,B) = 0$ iff $A = B$   Positivity (Separation)

•  $D(A,B) \le D(A,C) + D(B,C)$   Triangular Inequality

Lc'

Properties of distance measure

•  $D(A,B) = D(B,A)$   Symmetry   Otherwise you could claim that “Alex looks like Bob, but

Bob looks nothing like Alex”

•  $D(A,A) = 0$   Constancy of Self-Similarity   Otherwise you could claim that “Alex looks more like Bob,

than Bob does”

•  $D(A,B) = 0$ iff $A = B$   Positivity (Separation)

•  $D(A,B) \le D(A,C) + D(B,C)$   Triangular Inequality

LY'

Properties of distance measure

•  $D(A,B) = D(B,A)$   Symmetry   Otherwise you could claim that “Alex looks like Bob, but

Bob looks nothing like Alex”

•  $D(A,A) = 0$   Constancy of Self-Similarity   Otherwise you could claim that “Alex looks more like Bob,

than Bob does”

•  $D(A,B) = 0$ iff $A = B$   Positivity (Separation)   Otherwise there are objects in your world that are

different, but you cannot tell apart

•  $D(A,B) \le D(A,C) + D(B,C)$   Triangular Inequality   30

Properties of distance measure

•  $D(A,B) = D(B,A)$   Symmetry   Otherwise you could claim that “Alex looks like Bob, but

Bob looks nothing like Alex”

•  $D(A,A) = 0$   Constancy of Self-Similarity   Otherwise you could claim that “Alex looks more like Bob,

than Bob does”

•  $D(A,B) = 0$ iff $A = B$   Positivity (Separation)   Otherwise there are objects in your world that are

different, but you cannot tell apart

•  $D(A,B) \le D(A,C) + D(B,C)$   Triangular Inequality   Otherwise you could claim that “Alex is very like Bob and

Alex is very like Carl, but Bob is very unlike Carl”   31

Distance Measures

•  Given two objects $x$ and $y$ both with $n$ values

calculate the Minkowski distance as

`L'

$$d(x, y) = \sqrt[p]{\sum_{i=1}^{n} |x_i - y_i|^{p}}$$

$$x = (x_1, x_2, \ldots, x_n)$$

$$y = (y_1, y_2, \ldots, y_n)$$

Euclidean distance ($p = 2$):

$$d(x, y) = \sqrt{\sum_{i=1}^{n} (x_i - y_i)^{2}}$$

Manhattan distance ($p = 1$):

$$d(x, y) = \sum_{i=1}^{n} |x_i - y_i|$$

Example

33

$4^2 + 3^2 = 5^2$, so the Euclidean distance is 5

3

4

$4 + 3 = 7$, so the Manhattan distance is 7

reminder: buying milk, from home to store

Edit Distance •  To measure the similarity between two objects,

transform one of the objects into the other, and measure

how much effort it took. The measure of effort becomes

the distance measure.

The distance between Patty and Selma.

Change dress color, 1 point      Change earring shape, 1 point

Change hair part, 1 point

D(Patty,Selma) = 3

The distance between Marge and Selma.

Change dress color, 1 point      Add earrings, 1 point

Decrease height, 1 point

Take up smoking, 1 point

Lose weight, 1 point

D(Marge,Selma) = 5

`5'

Slide adapted from CMU

Clustering Process

•  The actual clustering algorithms

– Hierarchical algorithm, which creates a hierarchical

decomposition of the elements

–  Partitioning algorithm, which produces a single

partitioning by optimizing some criterion

`['

Set of Objects

Feature Representation

Similarity Measure

Element Grouping

Output Interpretation

Bottom-up Clustering

-  Begin with each element in a separate cluster

-  Merge clusters into successively larger clusters

-  Repeat until one cluster is left

`6'

Recommended reading: Chapter 14 on Clustering from the book by Manning & Schütze

Top-down Clustering

-  Begin with all elements in a whole cluster

-  Divide clusters into successively smaller

clusters

-  Repeat until all elements are in singleton

clusters

`7'

Recommended reading: Chapter 14 on Clustering from the book by Manning & Schütze

Cluster Proximity Estimate

•  Single-Link – Nearest Neighbor: the closest members

•  Complete-Link

– Furthest Neighbor: the furthest members

•  Centroid – Centers of gravity

`c'

Example: Single-Link Method

Euclidean Distance

`Y'

Distance Matrix

Partitioning Clustering

•  Constructs a partition of n objects into a set of K clusters

•  K-means algorithm:

1. Select K clusters arbitrarily.  2. Initialize cluster centers with those K clusters.

3. Do loop

  a) Partition by assigning or reassigning all data objects to their

closest cluster center.

  b) Compute new cluster centers as mean value of the objects in

each cluster.

Until no change in cluster center calculation   40

$$\mu_k = \frac{1}{c_k} \sum_{i \in C_k} x_i$$

where $c_k$ is the number of objects in cluster $C_k$.

Example

0   0

0   1

1   1

1   0

0.5   0.5

5   5

5   6

6   6

6   5

5.5   5.5

41

Task: Cluster the following objects into two clusters (k=2)

Use: Manhattan distance $d(x, y) = \sum_{i=1}^{n} |x_i - y_i|$ (here $n = 2$ coordinates per object)

Randomly initialize the clusters with the first two objects C1={(0,0)} C2={(0,1)}

Now: 2. Initialize cluster centers. 3a. Calculate the distance between each object and each cluster center, assigning the object to the closest cluster. 3b. Compute new cluster center for each cluster.

Clustering Process

•  Evaluation of the produced clustering output

5L'

Set of Objects

Feature Representation

Similarity Measure

Element Grouping

Output Interpretation

Clustering Evaluation

•  Compare the clustering output with a gold

standard (manually generated answer keys)

•  Embed the clustering output in an application

and use its evaluation measure

•  Example: search engine results

5`'

Putting Theory into Practice  (back to our NED Example)

Problem Formulation

•  Input: – N text snippets that mention a particular proper

name (it can be person, organization or location)

•  Output: – K clusters, where each cluster has text snippets that are similar to each other and different from

the snippets in the rest of the clusters

5['

Input

•  Dr. Jerry R. Hobbs (born 25 January 1942) is a prominent researcher in the fields of computational linguistics, discourse analysis, and artificial

•  Jerry Hobbs is the rage-filled, domestic-abusing career criminal who killed his 8-year-old daughter and her 9-year-old friend, with scarcely ...

•  Jerry Hobbs, Author. A fifth generation farmer, Wayne Cryts finished harvesting his crop in the fall of 1980 and hauled more than 32000 bushels of soybeans ...

•  Jerry Hobbs, who is accused of killing his 8-year-old daughter and her best ... On Wednesday, a judge denied bail for Jerry Hobbs, 34, ...

•  Fugitives | Jerry Hobbs - Brief - Father Denied Bail Awaits Trial For Children's Murders Jerry Branton Hobbs accused of the stabbing deaths ...

•  Jerry R. Hobbs. Address: USC/ISI 4676 Admiralty Way ... Jerry R. Hobbs hobbs@isi.edu. USC/ISI, 4676 Admiralty Way, Marina del Rey, CA 90292

56'

Output •  Cluster 1:

–  Dr. Jerry R. Hobbs (born 25 January 1942) is a prominent researcher in the fields of computational linguistics, discourse analysis, and artificial

–  Jerry R. Hobbs. Address: USC/ISI 4676 Admiralty Way ... Jerry R. Hobbs hobbs@isi.edu. USC/ISI, 4676 Admiralty Way, Marina del Rey, CA 90292

•  Cluster 2: –  Jerry Hobbs is the rage-filled, domestic-abusing career criminal who killed his

8-year-old daughter and her 9-year-old friend, with scarcely ... –  Jerry Hobbs, who is accused of killing his 8-year-old daughter and her best ...

On Wednesday, a judge denied bail for Jerry Hobbs, 34, … –  Fugitives | Jerry Hobbs - Brief - Father Denied Bail Awaits Trial For Children's

Murders Jerry Branton Hobbs accused of the stabbing deaths ...

•  Cluster 3: –  Jerry Hobbs, Author. A fifth generation farmer, Wayne Cryts finished

harvesting his crop in the fall of 1980 and hauled more than 32000 bushels of soybeans ...

57'

5c'

cluster snippets

Jerry Hobbs is the rage-filled, domestic-

abusing career criminal who killed his   Jerry Hobbs, who is accused of killing his 9-year-old daughter and her best

Dr. Jerry R. Hobbs is a prominent researcher in

the fields of computational linguistic

Jerry R. Hobbs. Address: USC/ISI 4676 Admiralty Way ...

Jerry Hobbs, a fifth generation farmer,

Wayne Cryts finished harvesting his crop

s1 s2 … sn

teach 2 0 … 7

kill 10 2 … 3

child 1 3 … 0      text snippet representation

snippet similarity

Collection of text snippets containing the name of interest

Text Snippet Representation

•  G&"'%A):"N:'AP'"#%&'/)+JJ":'+/'$"J$"/"):".'K;'#'E"%:A$'I+:&'C'.+8")/+A)/'

•  Each dimension indicates whether a particular feature

occurred in the context

–  the value can be binary, frequency count etc.

•  The features capture the characteristics of the context to be clustered

•  Intuitively, vectors/contexts that share the same

features will be similar to each other

5Y'

Contexts (input text snippets)

•  Cnt1: Dr. Jerry R. Hobbs (born 25 January 1942) is a prominent researcher in the fields of computational linguistics, discourse analysis, and artificial

•  Cnt2: Jerry Hobbs is the rage-filled, domestic-abusing career criminal who killed his 8-year-old daughter and her 9-year-old friend, with scarcely ...

•  Cnt3: Jerry Hobbs, Author. A fifth generation farmer, Wayne Cryts finished harvesting his crop in the fall of 1980 and hauled more than 32000 bushels of soybeans ...

•  Cnt4: Jerry Hobbs, who is accused of killing his 9-year-old daughter and her best ... On Wednesday, a judge denied bail for Jerry Hobbs, 34, ...

[M'

Text Snippet Features (1)

•  Unigram – a single word that occurs more

than a given number of times

51

kill   artificial   researcher   …   daughter

Cnt1:   0   1   1   0

Cnt2:   1   0   0   1

Cnt3:   0   0   0   0

Cnt4:   1   0   0   1

binary values

Text Snippet Features (1)

•  Unigram – a single word that occurs more

than a given number of times

52

• kill   1000

• artificial   500

• researcher   200

…

• daughter   100

frequency estimated from corpus

kill   artificial   researcher   …   daughter

Cnt1:   0   500   200   0

Cnt2:   1000   0   0   100

Cnt3:   0   0   0   0

Cnt4:   1000   0   0   100

frequency values

Text Snippet Features (2)

•  Bigram – an ordered pair of words that occur

together more often than expected by chance

53

kill his   prominent researcher   criminal who   …   8-year-old daughter

Cnt1:   0   1   0   0

Cnt2:   1   0   1   1

Cnt3:   0   0   0   0

Cnt4:   1   0   0   1

binary values

Text Snippet Features (2) •  Bigram – an ordered pair of words that occur

together more often than expected by chance

54

kill his   prominent researcher   criminal who   …   8-year-old daughter

Cnt1:   0   102.9   0   0

Cnt2:   21.2   0   68.5   35.9

Cnt3:   0   0   0   0

Cnt4:   21.2   0   0   35.9

frequency weights

• kill his   21.2

• prominent researcher   102.9

• criminal who   68.5

…

• 8-year-old daughter   35.9

$-\log P(w_1 \mid w_0)$, log-likelihood scores based on

frequency estimated from corpus

Text Snippet Grouping

•  group text snippets by similar meaning

•  snippet similarity is calculated as

[['

kill   artificial   researcher   daughter

Cnt1:   0   1   1   0

Cnt2:   1   0   0   1

Cnt3:   0   0   0   0

Cnt4:   1   0   0   1

$$\mathrm{sim}(Cnt_1, Cnt_2) = \sum_{i=1}^{n} w_{1i} \cdot w_{2i}$$

sim(Cnt1,Cnt2) = (0*1)+(1*0)+(1*0)+(0*1) = 0

sim(Cnt1,Cnt3) = (0*0)+(1*0)+(1*0)+(0*0) = 0

sim(Cnt1,Cnt4) = (0*1)+(1*0)+(1*0)+(0*1) = 0

sim(Cnt2,Cnt3) = (1*0)+(0*0)+(0*0)+(1*0) = 0

sim(Cnt2,Cnt4) = (1*1)+(0*0)+(0*0)+(1*1) = 2

sim(Cnt3,Cnt4) = (0*1)+(0*0)+(0*0)+(0*1) = 0

Final Output

•  Each cluster consists of a certain number of text snippets, i.e. small text fragments.

•  The clusters correspond to the different people sharing the same name

– Cluster1: Jerry Hobbs the researcher – Cluster2: Jerry Hobbs the killer – Cluster3: Jerry Hobbs the singer

[6'

Web People Search Challenge

•  The first challenge was organized in 2007

•  WePS focuses on person and organization name

disambiguation of Web pages

•  For each ambiguous name, the system must return the

documents and the attributes which are relevant for the

different senses of the name

•  Last such challenge was on 1st of July 2010

•  More information at:  http://nlp.uned.es/weps/

[7'

Name Discrimination Demo

•  SenseClusters by Ted Pedersen http://marimba.d.umn.edu/cgi-bin/SC-cgi/index.cgi

•  The software can be used for: – proper name discrimination

– word sense discrimination

– e-mail clustering

– synonym finding

[c'

What would you do with clustering?

•  E-mail clustering by topic

•  Organize documents into multiple categories

like Google news

•  Trace what two people talk about on Twitter •  Sentiment analysis

•  …

[Y'