U
    ܿdu                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ ddl
mZ dd	l
mZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ zddlmZ W n   dddddZY nX G dd deZdS )z>
Appraise evaluation framework

See LICENSE for usage details
    N)defaultdict)OrderedDict)iglob)floor)basename)exists)sep)randint)	randrange)seed)shuffle)exit)BaseCommand)CommandError)LANGUAGE_CODES_AND_NAMESMalteseEnglishBasqueSpanish)mltengeusspac                   @   s.   e Zd ZdZdd Zdd Zeddd	Zd
S )Commandz6Creates JSON file containing DirectAssessmentTask datac                 C   s  |j dtdd |j dtdd |j dtdd |j dtd	d |j d
tdd |j dtdd |j dtdd |j dtd dd |j dtddd |j dtddd |j dtddd |j dddd |j d d!d"d |j d#tdd$d |j d%td&d |j d'd!d(d |j d)d!d*d |j d+d!d,d |j d-d!d.d |j d/d!d0d |j d1d!d2d |j d3d!d4d |j d5d!d6d |j d7d!d8d |j d9d!d:d d S );N
batch_sizezTotal batch size)typehelpsource_languagezSource language codetarget_languagezTarget language codesource_filezPath to source text filereference_filezPath to reference text filesystems_pathzPath to systems text folderoutput_json_filezPath to JSON output filez--urls-filez Path to optional image URLs file)r   defaultr   z--task-definitionz	80:5:5:10z;Defines (candidates, repeats, reference, bad refs) per taskz--required-annotations   z5Specifies required annotations per batch (default: 1)z--random-seedi@ zRandom generator seed valuez--randomizestore_falsezRandomize extracted work items)actionr   z--pad-batches
store_truez7Adds redundant batches to reach requested --max_batchesz
--batch-noz'Specifies desired batch no (default: 1)z--max-batchesz)Specifies max number of batches to createz--all-batchesz%Produces all possible batches at oncez--source-basedzCreates source-based work itemsz	--unicodez&Expects text files in Unicode encodingz--local-srcz'Loads source text from local .src filesz--local-refz*Loads reference text from local .ref filesz--create-idsz,Creates segment ids without local .ids filesz--full-coveragez"Ensures segments are fully coveredz--character-basedz=Enable character-based mode, default for Chinese and Japanesez--no-redundancyz=Disable redundant quality control, maximising data collectionz--ignore-emptyz&Replaces empty lines with "EMPTY_LINE")add_argumentintstr)selfparser r-   f/var/www/rival/public_html/translation-eval/EvalData/management/commands/CreateDirectAssessmentData.pyadd_arguments)   s          zCommand.add_argumentsc           l      O   s0  ~t tdd tD }|  |d  }||krb| jd| | jdd| d S |d  }||kr| jd| | jdd| d S |d	 }|d
 }|d }|d }	|d }
|d }|r| jd d}nzvz@dd |d 	dD }t
|dkr*|d dkr*t|}W n0 tk
r\   | jd|d  d}Y nX W 5 | jd| X |rdnd}|d }g }|st|d ||}tdt|  g }|st|d ||}td t|  g }|d! d k	r*t|d! ||}td"t|  g }|d# }d$|td%}t|D ]4}d&t|krttd'| td( || qL|d) }|  t| t| i }tt }|d*kp|d+kp|d*ks|d+kr|
p|d, }td-|  |D ]}t|||}|	s(t|d.d/||}nd0d tt|D }g }g }|rp|d.d1} t| rpt| ||}|r|d.d2}!t|!rt| ||}| D ]\}"}#|r||" n||" }|r||" n||" }$td3|# || | |$ | ! }%|r||" nd }&d4}'|r"|#n|#	d5}(t|(d4kr@d4}'nt|(d4krbt|(d6krbd7}'njt|(d6krt|(d8krd9}'nHt|(d8krt|(d:krd;}'n&t|(d:krt|(d<krd6}'nd=}'|rd7|' }'g })t|)|'krx|rt"dt|d4 nt"dt|d4 }*|
rF|r4t"dt|d4 nt"dt|d4 }*d }+||* }+|r`||* }+|+	d5})|r|+})qd },t|)|' dkrt"dt|)|' nd}-|)|-|-|'  },t|(|' dkrt"dt|(|' nd}-|(d |- |, |(|-|' d   }.d5|.}/|rd>|.}/|%| krh|"|#|/|$|t|gd?||%< |&rX||% #d@|&i ||" |% n||% dA t| qtdBt| t| qt$j%|d7dCdD}0|dE dF }1t&|1dGddH6}2| jjdI|1d>dJ |2t'|0 | jdK W 5 Q R X t | }3|3  t|3 |dL }4|4rvg }5|3D ]6}6|6|5kr:||6 dM }"||" }7|7  |5(|7 q:|5}3|d }8|8t|3|8  }9tdN|9|8t|3 |3(|3d|9  tdO|9 t)t*t|3|8 }:tdP|: |dQ };|dR }<|dS }=|=s|;d4 gn
t t|:}>|<r,|>d |< }>g }0|>D ]B}?g }@|?|d  }A|A|d  }B|3|A|B }C|C  t|C t td}Dt|D |d4d  \}E}F}Gt|E|F|G tdT|E tdU|F tdV|G d}H|E}I|D|H|I }J|I}H|I|F7 }I|D|H|I }K|I}H|I|G7 }I|D|H|I }LtdW|J tdX|K tdY|L dZd tdD }8t+|Cd d D ]b\}M}N|Nd[f|8|M< d }O|M|Jk	rnd\}On|M|Kk	r~d]}On|M|Lk	rd^}O|Od k		rF|N|Of|8|Md < 	qFg }PtdD ]&}M|8|Md  d k	r|P|Md  	qtd_|P t,|P|Cdd  D ]\}M}N|Nd[f|8|M< 	qtd`t|8 tdatdbd |8D  tdD ]>}Mt-dd4}Q|Qd4k
r@|8|M }R|8|Md  |8|M< |R|8|Md < 
q@dc}St|SD ]8}Tdc|T }Adc|Td4  }B|8|A|B }Udd|Ui}V|@|V 
q|

rt|d }W|
rde}Wnt|d }W|
rdf}Wt.|?d4 |dg |d |d |dh |di}Xg }Yd}Zt|SD ]&}T|@|T dd }U|UD ]\}[}\||[ }]|]dM }^|]dj }_|]dk }`|]dl }a|]dm }b|]/d@}c|]dA }dd&t0t|d}e|_}f|\d]krt|d }e|a}fn|\d^kr|`}ft. }g|Z|gdn< |Tdc|?  |gdo< |W|gdp< |a|gdq< |
r|b|gdq< |e|gdr< |f|gds< |^|gdt< |\|gdu< |cd k	rD|c|gdv< |Y|g |Zd47 }ZqJq4t.|X|Ydw}h|0|h q4|dx r|<d k	r|<t|0 }itdy|i t|iD ]}j|0|0|j  qt$j%|0d7dCdD}0|dE }kt&|kdGddH:}2| jjdI|dE d>dJ |2t'|0 | jdK W 5 Q R X d S )zNc                 S   s   g | ]}|  qS r-   )lower.0xr-   r-   r.   
<listcomp>   s     z"Command.handle.<locals>.<listcomp>r   zUnknown source language: {0}!zKnown languages: {0}z, r   zUnknown target language: {0}!unicode	local_src	local_ref
create_idssource_basedno_redundancyzNo redundancy set.)d   r   r   r   zUsing task definition: {0}c                 S   s   g | ]}t |qS r-   )r)   r1   r-   r-   r.   r4     s     Ztask_definition:r;   r   2   z Bad task definition value: {0!r})P      r?   
   utf16utf8ignore_emptyr   zLoaded {0} source segmentsr    zLoaded {0} reference segments	urls_filezLoaded {0} image URLsr!   z	{0}{1}{2}*+zUCannot use system files with + in names as this breaks multi-system meta systems:
{0}Zrandom_seedZzhoZjpncharacter_basedzcharacter_based = z.txtz.idsc                 S   s   g | ]}|d  qS )r$   r-   r1   r-   r-   r.   r4   g  s     z.srcz.refmd5r$    r?                         )
segment_idsegment_textsegment_badsegment_refsegment_srcsystemsZsegment_urlrX   zLoaded {0} system {1} segmentsT)indent	sort_keysr"   z	.segmentsw)modeencodingzCreating {0} ... )endingOKfull_coveragerS   zMissing items is {0}/{1}/{2}z%Added {0} missing items rotating keyszTotal number of batches is {0}batch_nomax_batchesall_batchesz
chk_items:z
ref_items:z
bad_items:zchk_ids:zref_ids:zbad_ids:c                 S   s   g | ]}d qS Nr-   )r2   _r-   r-   r.   r4   `  s     TGTCHKREFBADZempty_slotszlen(batch_items):zlen(batch_items) == None:c                 S   s   g | ]}|d kr|qS rd   r-   r1   r-   r-   r.   r4   z  s      r@   block_itemsZ	LOCAL_SRCZ	LOCAL_REFr   Zrequired_annotations)batchNoZ	batchSizeZsourceLanguageZtargetLanguagerequiredAnnotationsZ
randomSeedrT   rU   rV   rW   _item_blocksourceID
sourceTexttargetID
targetTextitemIDitemTypeimageURL)taskitemsZpad_batcheszpad_size = {0})1listsetr   sortr0   stdoutwriteformatjoinsplitsumtuple
ValueErrorr   _load_text_from_fileprintlenkeyspath_sepr   r   sys_exitappendr   r   r   replaceranger   rw   hashlibnewencode	hexdigestr
   updatejsondumpsopenr*   extendr)   r   	enumeratezipr	   r   getsorted)lr+   argsoptionsZ_allZ_srcZ_tgtZunicode_encZuse_local_srcZuse_local_refr8   r9   r:   Zitems_per_batchZtask_defr]   rC   r   r    rD   Zsystems_filesr!   Zsystems_globZsystem_fileZrandom_seed_valueZhashed_textZhashes_by_idsrH   Zsystem_pathZ
system_txt
system_idsr6   r7   Zlocal_src_pathZlocal_ref_pathrS   rT   Z_refZmd5hashZ_urlZ_bad_lenZ_tokensZ_bad_tokensZ_bad_idZ	_bad_textZ_bad_phraseZ_indexZ_badrU   Z	json_dataZsegments_file_nameout_fileZall_keysr`   Z_sorted_keyskeyZmatching_keysZbatch_itemsZmissing_itemsZtotal_batchesra   rb   rc   Z	batch_nosZbatch_idZ
block_dataZblock_startZ	block_endZblock_hashesZ	check_idsZ	chk_itemsZ	ref_itemsZ	bad_itemsstart_index	end_indexZchk_idsZref_idsZbad_idsindexZ	item_hash	item_typeZemtpy_slotsZrandom_swapZ
temp_valueZ
num_blocksZblock_idrj   current_blockZ	source_id	task_dataZ
items_datarm   Zcurrent_itemZcurrent_typeZ	item_dataitem_idZ	item_textZitem_badZitem_refZitem_srcZitem_urlZitem_systems	target_idZtarget_textobjoutput_dataZpad_sizeZ	pad_indexZjson_file_namer-   r-   r.   handle   s         

  

  
  
 


	
 
  



















zCommand.handlerB   Fc           	   	   C   s~   d}t  }t| |d^}|D ]R}|d7 }| }|sf|sNd| d}t|nd| d}d}t| |||< qW 5 Q R X |S )Nr   )r]   r$   zEmpty segment id=z2! Use --ignore-empty to replace with "EMPTY_LINE".z! Replaced with "EMPTY_LINE".Z
EMPTY_LINE)r   r   stripr   r   )		file_pathr]   rC   rS   Z	file_textZ
input_filecurrent_lineZcleaned_line_msgr-   r-   r.   r     s"    


zCommand._load_text_from_fileN)rB   F)__name__
__module____qualname__r   r/   r   staticmethodr   r-   r-   r-   r.   r   %   s    1    r   )__doc__r   r   collectionsr   r   globr   mathr   os.pathr   r   r   r   randomr	   r
   r   r   sysr   r   django.core.management.baser   r   Dashboard.modelsr   r   r-   r-   r-   r.   <module>   s2   
