
    i!                         S SK r S SKrS SKrS SKrS SKrS SKJrJr   S SKJ	r	  S rS rS rS rS rS	 rS
 rS rSS jrS rS rS rS rS r\S:X  a  \" 5         gg! \
 a    Sr	 NEf = f)    N)Counterdefaultdict)	LinearSVCc                 <    U S::  a  g[         R                  " U 5      $ )Nr           )mathlog1p)vs    tools/ltr/train_rank_ltr.py
log1p_safer      s    Av::a=    c                 J   U =(       d    SR                  5       R                  5       nU(       d  g/ nU HB  nUR                  5       (       d  US;   a  UR                  U5        M1  UR                  S5        MD     SR	                  U5      R                  S5      nU=(       d    S$ )Ndefaultz_-_ )striplowerisalnumappendjoin)r
   soutcs       r   norm_profile_namer      s}    	
i &&(A
C99;;!t)JJqMJJsO	 
 	3A>	r   c                    / n[        U SSS9 n[        R                  " U5      nU GH  nUR                  [	        UR                  SS5      5      R                  5       [	        UR                  SS5      5      R                  5       [        UR                  SS5      =(       d    S5      [        UR                  SS5      =(       d    S5      [        UR                  S	S5      =(       d    S5      [        UR                  S
S5      =(       d    S5      [        UR                  US5      5      S.5        GM     S S S 5        U Vs/ s H  oUS   (       d  M  UPM     nnU(       d  [        S5      eU$ ! , (       d  f       N?= fs  snf )Nr   utf-8)newlineencodingquery_idurllevelr   	title_lentext_lenlabelr   )r   r    r!   r"   r#   r$   intentzNo valid rows with query_id)
opencsv
DictReaderr   strgetr   floatr   
ValueError)path
intent_colrowsfrdrrs         r   	load_rowsr3   #   s)   D	dB	1QnnQAKK #AEE*b$9 : @ @ BquuUB/0668"155!#4#9:!&quu[!'<'A!B %aeeJ&:&?a @"155!#4#9:/j)0LM
  
2 -t!}AtD-677K# 
2	1 .s   D*E1FF1
E?c                 f   [        S U  5       5      n[        S U  5       5      =(       d    Sn[        S U  5       5      =(       d    Sn/ nU  H`  n[        U5      nU[        US   S5      -
  S-   U-  n[        US   5      U-  n[        US   5      U-  n	XxU	/US'   UR                  U5        Mb     U$ )	Nc              3   @   #    U  H  n[        US    S5      v   M     g7f)r!         ?N)max.0r2   s     r   	<genexpr>%normalize_features.<locals>.<genexpr>:   s     7$QC'
C(($s   c              3   >   #    U  H  n[        US    5      v   M     g7f)r"   Nr   r8   s     r   r:   r;   ;   s     ADq
1[>22D   r6   c              3   >   #    U  H  n[        US    5      v   M     g7f)r#   Nr=   r8   s     r   r:   r;   <   s     ?$Qz!J-00$r>   r!   r"   r#   x)r7   dictr   r   )
r/   	max_levelmax_title_logmax_text_logr   r2   rrf_levelf_titlef_texts
             r   normalize_featuresrI   9   s    7$77IADAAHSM?$??F3L
C!Ws2g;44s:iGR_-=BzN+l:V,3

2  Jr   c                 b    [        [        5      nU  H  nXS      R                  U5        M     U$ )Nr   )r   listr   )r/   by_qr2   s      r   group_by_queryrM   H   s0    tDz]""1% Kr   c                 X   / n/ n[        [        U 5      5       H  n[        [        U 5      5       Hl  nX4:X  a  M
  X   S   X   S   :  d  M  [        S5       Vs/ s H  oPU   S   U   X   S   U   -
  PM     nnUR                  U5        UR                  S5        Mn     M     X4$ s  snf )Nr$      r@      )rangelenr   )r/   r@   yijkds          r   make_pairwiserX   O   s    
A
A3t9s4y!Avww$''"22@EaI1!WS\!_tws|A6I "  4K Js   B'c                     [        U 5      nSnUR                  5        HF  n[        S U 5       5      nSn[        UR	                  5       5       H  nXF   nX'U-  -  nXW-  nM     MH     U$ )Nr   c              3   *   #    U  H	  oS    v   M     g7f)r$   N r8   s     r   r:   'count_positive_pairs.<locals>.<genexpr>a   s     6v!W:vs   )rM   valuesr   sortedkeys)r/   rL   pairsq_rowsby_labellower_totalr$   cnts           r   count_positive_pairsre   ]   sn    $DE++-6v66HMMO,E/C;&&EK -   Lr   c                     Sn[        U S U SS9 H*  u  p4USU-  S-
  [        R                  " US-   5      -  -  nM,     U$ )Nr   rP   )startg       @r6   )	enumerater   log2)labelsrV   r   rT   rels        r   dcgrl   j   sI    
CF2AJa0S3$))AG"444 1Jr   c                    ^ [        U U4S jSS9n[        U S SS9n[        U Vs/ s H  oUS   PM	     snU5      n[        U Vs/ s H  oUS   PM	     snU5      nUS:  a  Xg-  $ S$ s  snf s  snf )Nc                 D   >^  [        U U4S j[        S5       5       5      $ )Nc              3   @   >#    U  H  nTU   TS    U   -  v   M     g7f)r@   Nr[   )r9   rT   r2   ws     r   r:   3ndcg_for_query.<locals>.<lambda>.<locals>.<genexpr>r   s"     +OhAaD1S6!9,<hs   rO   )sumrQ   )r2   rp   s   `r   <lambda> ndcg_for_query.<locals>.<lambda>r   s    +OeAh+O(Or   T)keyreversec                     U S   $ )Nr$   r[   )r2   s    r   rs   rt   s   s    qzr   r$   r   r   )r^   rl   )r/   rp   rV   scoredidealr2   gotbests    `      r   ndcg_for_queryr|   q   sw    DOY]^F414@E
6*6az6*A
.CE*Eq'
E*A.D!8CJ,, +*s   A'A,c           	          / nU  H  nUR                  X%   5        M     [        U5      u  pg[        U5      S:  a  g [        c  [	        S5      e[        USS9nUR                  Xg5        UR                  S   R                  5       n	[        S U	 5       5      n
U	 Vs/ s H  n[        S[        U5      5      U
-  PM     n	n/ nU H  nUR                  [        X%   U	SS	95        M!     X(       a  [        U5      [        U5      -  S.$ S
S.$ s  snf )N   z3scikit-learn is required (pip install scikit-learn)i  )Cmax_iterr   c              3   L   #    U  H  n[        S [        U5      5      v   M     g7f)&.>N)r7   abs)r9   r
   s     r   r:   train_once.<locals>.<genexpr>   s     -1aD#a&!!1s   "$r   
   )rV   r   )rp   ndcg10)extendrX   rR   r   RuntimeErrorfitcoef_tolistrr   r7   r   r   r|   )train_qtest_qrL   c_value
train_rowsqr@   rS   modelrp   w_sumr
   ndcgss                r   
train_oncer   y   s    J$'" $DA
1vzPQQ$/E	IIaOAA-1--E,-.AqT3q6	U	"AA.E^DGQ"56 5s5zCJ6JJcJJ 	/s   "Dc                 Z  ^^^ [        S5       V^s/ s H$  m[        U4S jU  5       5      [        U 5      -  PM&     nn/ n[        S5       HO  mUT   m[        UU4S jU  5       5      [        U 5      -  nUR                  [        R
                  " U5      5        MQ     U(       a  [        U5      [        U5      -  OSmU(       a5  [        R
                  " [        U4S jU 5       5      [        U5      -  5      OSn[        U 5      US   US   US   S	.US   US   US   S	.TUS
.$ s  snf )NrO   c              3   ,   >#    U  H	  oT   v   M     g 7f)Nr[   )r9   r
   rT   s     r   r:   !aggregate_runs.<locals>.<genexpr>   s     &v!1vs   c              3   8   >#    U  H  oT   T-
  S -  v   M     g7fr~   Nr[   )r9   r
   rT   mus     r   r:   r      s     3FqQ4"9"Fs   r   c              3   2   >#    U  H  oT-
  S -  v   M     g7fr   r[   )r9   r@   	ndcg_means     r   r:   r      s     >Iq9}*Is   r   rP   r~   )r!   titletext)runs_okmean_wstd_wndcg10_mean
ndcg10_std)rQ   rr   rR   r   r   sqrt)	w_runs	ndcg_runsrT   meansstdsvarndcg_stdr   r   s	     `    @@r   aggregate_runsr      s   >CAhGhS&v&&V4hEGD1X1X3F33c&kADIIcN#  4=IY/#IT]		#>I>>YOPcf  v;!!HuQxqJ7T!Wd1gF   Hs   +D(c                 0   [        U 5      [        U  Vs1 s H  o3S   iM	     sn5      [        U 5      S.n[        U 5      n[        U5      n[	        UR                  5       5      n[        U5      UR                  :  a  S U4$ [        R                  " U5      n/ n	/ n
[        [        SUR                  5      5       H  nUS S  nUR                  U5        [        S[        [        U5      S-
  [        [        U5      UR                  -  5      5      5      nUS U nXS  n[!        XXaR"                  5      nU(       d  M  U	R%                  US   5        U
R%                  US   5        M     U	(       d  S U4$ ['        X5      nUR)                  U5        UU4$ s  snf )Nr   )r/   queriesr`      rP   rp   r   )rR   re   rI   rM   rK   r_   min_queries_per_intentrandomRandomrQ   r7   runsshufflemininttrain_ratior   r   r   r   update)r/   argsseedr2   metarows_nrL   r   rndr   r   r   r   cutr   r   r   reps                     r   train_profiler      sh   D	t4t!*t45%d+D  %F&!D499;G
7|d111Tz
--
CFI3q$))$%AJA!SQ!SQ$2B2B)B%CDEDS'4$7c#hX' & Tz

+CJJt9; 5s   F
c                 >   [        U SSS9 nUR                  S5        UR                  S5        UR                  5        H  u  p4UR                  SU S35        UR                  SUS	   S
   S S35        UR                  SUS	   S   S S35        UR                  SUS	   S   S S35        UR                  SUS   S
   S S35        UR                  SUS   S   S S35        UR                  SUS   S   S S35        M     S S S 5        g ! , (       d  f       g = f)Nrp   r   r   z0# Auto-generated by tools/ltr/train_rank_ltr.py
zE# Multi-profile format: [default], [blog], [docs], [ecommerce], ...

[z]
zw_level=r   r!   z.8f
zw_title=r   zw_text=r   z
std_level=r   z
std_title=z	std_text=z

)r&   writeitems)r-   profilesr0   namereports        r   
write_confr      s&   	dC'	*a	CD	YZ$NN,LDGGavSM"GGhvh/8=R@AGGhvh/8=R@AGGgfX.v6s;2>?GGj!9# >bABGGj!9# >bABGGiw 7<DAB - 
+	*	*s   C9D
Dc                     [         R                  " SS9n U R                  SSSS9  U R                  SSS	S
9  U R                  S[        SS9  U R                  S[        SS9  U R                  S[        SS9  U R                  S[        SS9  U R                  S[        SS9  U R                  SSS9  U R                  SSS9  U R                  5       n[        UR                  UR                  5      n[        X!UR                  5      u  p4U(       d  [        S5      e[        [        5      nU H  nXVS      R                  U5        M     SU0n0 n0 n	Sn
[        UR!                  5       5       HP  nUS:X  a  M  [        X[   XR                  U
-   5      u  pU
S-  n
U(       a
  XU'   XU'   M>  SUS   US    US!   S".X'   MR     [#        UR$                  U5        UUU	[        UR!                  5       5      S#.n['        UR(                  S$S%S&9 n[*        R,                  " XS'S(9  S S S 5        [/        S)5        [/        [*        R0                  " US'S(95        [/        S*UR$                   35        g ! , (       d  f       NP= f)+NzHTrain RankSVM-like linear LTR and export multi-profile rank_weights.conf)descriptionz--inputTz=CSV with query_id,url,level,title_len,text_len,label[,intent])requiredhelpz--intent-colr%   z0CSV column used as profile key (default: intent))r   r   z--runsd   )typer   z--train-ratiog?z--cr6   z--seed*   z--min-queries-per-intentrO   z
--out-confzrank_weights.conf)r   z
--out-jsonztools/ltr/ltr_report.jsonz.No valid training run produced default profiler   rP   insufficient_data_or_pairsr   r/   r`   )reasonr   r/   r`   )r   r   skippedprofiles_writtenrp   r   r   r~   )indentzTraining completezExported weights: )argparseArgumentParseradd_argumentr   r+   
parse_argsr3   inputr.   r   r   r   r   rK   r   r^   r_   r   out_confr&   out_jsonjsondumpprintdumps)apr   r/   
all_reportr   	by_intentr2   r   intent_reportsr   offsetr%   r   r   r   r0   s                   r   mainr      sJ   		 	 ^
B OOI3rOsOONH;mOnOOH3O4OOO%O=OOEsO3OOH3O3OO.S!ODOOL*=O>OOL*EOF==?DTZZ1D!$dii8MJKLLD!IH+%%a(  :&HNGF)*Y!)"3T99v;MN	!"V%(6" 7	?Vg	GO +  t}}h' " 1	F 
dmmS7	3q		&A& 
4 

	$**VA
&'	t}}o
./ 
4	3s   I((
I6__main__)r   )r   r'   r   r   r   collectionsr   r   sklearn.svmr   	Exceptionr   r   r3   rI   rM   rX   re   rl   r|   r   r   r   r   r   __name__r[   r   r   <module>r      s     
    ,%
,
-K*( FC;0| zF M  Is   A A*)A*