{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "from itertools import chain\n",
    "from math import ceil\n",
    "\n",
    "MAX_BATCH_SIZE = 100\n",
    "BLOCK_SIZE = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_sents(filepath: str) -> list:\n",
    "    with open(filepath, mode=\"r\", encoding=\"utf-8\") as f:\n",
    "        sents = f.read().splitlines()\n",
    "    return sents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import data\n",
    "SRC_LANG_CODE, TGT_LANG_CODE = \"mlt\", \"eng\"\n",
    "srcs = read_sents(\"/home/falcao/experiments/mt-en/mt-en.mt\")\n",
    "refs = read_sents(\"/home/falcao/experiments/mt-en/mt-en.en\")\n",
    "hyps_gtrans = read_sents(\"/home/falcao/experiments/mt-en/hyp-gtrans.en\")\n",
    "hyps_mstrans = read_sents(\"/home/falcao/experiments/mt-en/hyp-mstrans.en\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create DFs for each target type/ID and concat them in final DF\n",
    "\n",
    "df_gtrans = pd.DataFrame({\n",
    "    \"itemID\": range(len(srcs)),\n",
    "    \"itemType\": \"TGT\",\n",
    "    \"sourceID\": \"mt-en.mt\",\n",
    "    \"sourceText\": srcs,\n",
    "    \"targetID\": \"hyp-gtrans.en\",\n",
    "    \"targetText\": hyps_gtrans\n",
    "})\n",
    "\n",
    "df_mstrans = pd.DataFrame({\n",
    "    \"itemID\": range(len(srcs)),\n",
    "    \"itemType\": \"TGT\",\n",
    "    \"sourceID\": \"mt-en.mt\",\n",
    "    \"sourceText\": srcs,\n",
    "    \"targetID\": \"hyp-mstrans.en\",\n",
    "    \"targetText\": hyps_mstrans\n",
    "})\n",
    "\n",
    "df_ref = pd.DataFrame({\n",
    "    \"itemID\": range(len(srcs)),\n",
    "    \"itemType\": \"REF\",\n",
    "    \"sourceID\": \"mt-en.mt\",\n",
    "    \"sourceText\": srcs,\n",
    "    \"targetID\": \"mt-en.en\",\n",
    "    \"targetText\": refs,\n",
    "})\n",
    "\n",
    "df = pd.concat([df_gtrans, df_mstrans, df_ref])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "# randomize\n",
    "df = df.sample(len(df), random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_batches = ceil(len(df) / MAX_BATCH_SIZE)\n",
    "batches_json = []\n",
    "\n",
    "for batch_id in range(n_batches):\n",
    "    # get section of DF that will be the batch\n",
    "    first = batch_id * MAX_BATCH_SIZE\n",
    "    last = first + MAX_BATCH_SIZE\n",
    "    batch_df = df[first:last].copy()\n",
    "\n",
    "    # create `_item` and `_block` columns according to block size\n",
    "    batch_df[\"_block\"] = list(chain(*[[block_id] * BLOCK_SIZE for block_id in range(ceil(len(batch_df) / BLOCK_SIZE))]))[:len(batch_df)]\n",
    "    batch_df[\"_item\"] = range(len(batch_df))\n",
    "\n",
    "    # reorder columns\n",
    "    if all(batch_df.columns[-2:] == [\"_block\", \"_item\"]):\n",
    "        cols = [\"_block\", \"_item\", *(batch_df.columns[:-2])]\n",
    "        batch_df = batch_df[cols]\n",
    "\n",
    "    # generate dict for the batch\n",
    "    batch_json = {\n",
    "        \"items\": batch_df.to_dict(orient=\"records\"),\n",
    "        \"task\": {\n",
    "            \"batchNo\": batch_id + 1, # 1-based\n",
    "            \"batchSize\": BLOCK_SIZE,\n",
    "            \"randomSeed\": 1,\n",
    "            \"requiredAnnotations\": 1,\n",
    "            \"sourceLanguage\": src_lang,\n",
    "            \"targetLanguage\": tgt_lang,\n",
    "        }\n",
    "    }\n",
    "\n",
    "    # add to json list of batches\n",
    "    batches_json.append(batch_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"/home/falcao/Appraise/MyCampaign/batches-3x100.json\", mode=\"w+\") as fout:\n",
    "    json.dump(batches_json, fout, indent=2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
