{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f857d1db-7d6d-4c76-af4f-d508a4027192",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "head: error reading 'data/x-stance/': Is a directory\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first two records of the English question file.\n",
    "# The directory only contains .jsonl files, so the extension must be .jsonl.\n",
    "!head -2 data/x-stance/questions.en.jsonl\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "b6421d22-ca47-4ecf-b667-8f19f4cb035a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "questions.de.jsonl  questions.en.jsonl\tquestions.fr.jsonl  questions.it.jsonl\n"
     ]
    }
   ],
   "source": [
    "!ls  data/x-stance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1d288f21-0b89-4974-bded-d5ca9ff24f82",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "67b7a07f-26d9-4e6a-8473-2614f6b34887",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse data/x-stance/questions.it.jsonl (one JSON object per line) into a DataFrame.\n",
    "json_list = []\n",
    "# The context manager closes the file handle even if a line fails to parse;\n",
    "# JSON text is UTF-8, so do not depend on the platform default encoding.\n",
    "with open('data/x-stance/questions.it.jsonl', encoding='utf-8') as jsonl_file:\n",
    "    for line in jsonl_file:\n",
    "        if line.strip():  # tolerate blank lines, e.g. a trailing newline at EOF\n",
    "            json_list.append(json.loads(line))\n",
    "# One row per parsed record, one column per key.\n",
    "data = pd.DataFrame(json_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "d505a983-7d08-4fa0-8281-99027e8edd4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sampling configuration: one random seed per sample, k rows per sample.\n",
    "seed_list = [100, 13, 21, 42, 87]\n",
    "k = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "824ff804-347d-4913-a3f0-20bd38cb4159",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "100-100  100-13  100-21  100-42  100-87  16-100  16-13\t16-21  16-42  16-87\n"
     ]
    }
   ],
   "source": [
    "!ls /home/mist/projects/LM-BFF/data/k-shot/x-stance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "5ddb0ab8-0dfb-44bc-99c3-69829c86d8a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root directory for the generated k-shot folders. The default keeps the original\n",
    "# machine-specific location; set K_SHOT_OUTPUT_DIR to write somewhere else.\n",
    "output_dir = os.environ.get('K_SHOT_OUTPUT_DIR', '/home/mist/projects/LM-BFF/data/k-shot/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "8b8b604e-5767-4bdb-b620-2d4c464c595c",
   "metadata": {},
   "outputs": [],
   "source": [
    "task = 'x-stance'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "59dd94e5-b067-4644-832f-bc0e45d3486f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create one '<k>-<seed>' directory per seed and draw the matching k-row sample.\n",
    "task_dir = os.path.join(output_dir, task)  # loop-invariant, so computed once\n",
    "k_shot_samples = {}  # seed -> sampled DataFrame, kept so the draw is not thrown away\n",
    "# TODO: write each sample into its setting_dir in the format the consumer expects.\n",
    "for seed in seed_list:\n",
    "    setting_dir = os.path.join(task_dir, f\"{k}-{seed}\")\n",
    "    os.makedirs(setting_dir, exist_ok=True)\n",
    "    # random_state=seed makes each draw repeatable for a given seed.\n",
    "    k_shot_samples[seed] = data.sample(n=k, random_state=seed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "531acdd0-cbf6-430a-91e2-66923236908c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>text</th>\n",
       "      <th>topic</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>Finden Sie es grundsätzlich richtig, dass der ...</td>\n",
       "      <td>Welfare</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>Soll zusätzlich zur bestehenden Mutterschaftsv...</td>\n",
       "      <td>Welfare</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>Die Invalidenversicherung spricht bei nicht ob...</td>\n",
       "      <td>Welfare</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7</td>\n",
       "      <td>Würden Sie eine nationale Spitalplanung befürw...</td>\n",
       "      <td>Healthcare</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9</td>\n",
       "      <td>Finden Sie es richtig, dass einzelne ärztliche...</td>\n",
       "      <td>Healthcare</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>189</th>\n",
       "      <td>3464</td>\n",
       "      <td>Würden Sie eine Ausdehnung der rechtlichen Mög...</td>\n",
       "      <td>Security</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>190</th>\n",
       "      <td>3468</td>\n",
       "      <td>Soll die Schweiz Verhandlungen über den Beitri...</td>\n",
       "      <td>Foreign Policy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>3469</td>\n",
       "      <td>Soll der Bundesrat ein Freihandelsabkommen mit...</td>\n",
       "      <td>Foreign Policy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>192</th>\n",
       "      <td>3470</td>\n",
       "      <td>Eine Initiative fordert, dass die Haftungsrege...</td>\n",
       "      <td>Foreign Policy</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>3471</td>\n",
       "      <td>Befürworten Sie die Kandidatur der Schweiz für...</td>\n",
       "      <td>Foreign Policy</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>194 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       id                                               text           topic\n",
       "0       2  Finden Sie es grundsätzlich richtig, dass der ...         Welfare\n",
       "1       4  Soll zusätzlich zur bestehenden Mutterschaftsv...         Welfare\n",
       "2       6  Die Invalidenversicherung spricht bei nicht ob...         Welfare\n",
       "3       7  Würden Sie eine nationale Spitalplanung befürw...      Healthcare\n",
       "4       9  Finden Sie es richtig, dass einzelne ärztliche...      Healthcare\n",
       "..    ...                                                ...             ...\n",
       "189  3464  Würden Sie eine Ausdehnung der rechtlichen Mög...        Security\n",
       "190  3468  Soll die Schweiz Verhandlungen über den Beitri...  Foreign Policy\n",
       "191  3469  Soll der Bundesrat ein Freihandelsabkommen mit...  Foreign Policy\n",
       "192  3470  Eine Initiative fordert, dass die Haftungsrege...  Foreign Policy\n",
       "193  3471  Befürworten Sie die Kandidatur der Schweiz für...  Foreign Policy\n",
       "\n",
       "[194 rows x 3 columns]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "bcf450cf-cb23-462f-b070-3529c1dfa86d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Infrastructure & Environment    31\n",
       "Economy                         23\n",
       "Security                        20\n",
       "Immigration                     19\n",
       "Society                         17\n",
       "Education                       16\n",
       "Foreign Policy                  16\n",
       "Finances                        15\n",
       "Welfare                         15\n",
       "Healthcare                      11\n",
       "Political System                 9\n",
       "Digitisation                     2\n",
       "Name: topic, dtype: int64"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data['topic'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b07328e8-5fcf-4fad-9468-49ff91652ef9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw a k-row sample for every seed.\n",
    "# NOTE(review): the sampled frame is not assigned anywhere, so nothing is kept.\n",
    "for seed in seed_list:\n",
    "    data.sample(n=k, random_state=seed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "8d7aa681-bd14-4c48-881a-9130a9b88edc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Integer class id for every topic name; ids are assigned in listing order.\n",
    "label_encoding = {\n",
    "    topic: class_id\n",
    "    for class_id, topic in enumerate([\n",
    "        'Infrastructure & Environment',\n",
    "        'Economy',\n",
    "        'Security',\n",
    "        'Immigration',\n",
    "        'Society',\n",
    "        'Education',\n",
    "        'Foreign Policy',\n",
    "        'Finances',\n",
    "        'Welfare',\n",
    "        'Healthcare',\n",
    "        'Political System',\n",
    "        'Digitisation',\n",
    "    ])\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "3c2fad2e-77cf-4eff-9dda-5773f59f4402",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode each row's topic as its integer class id; an unmapped topic raises KeyError.\n",
    "data['label'] = data['topic'].apply(\n",
    "    lambda topic_name: label_encoding[topic_name]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "6e4c0a41-31a8-4620-bfed-c776a21e0c1c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(194, 4)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4fc29374-8495-43b7-829f-90c11bf8a974",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'pd' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-4-a66617debac6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/x-stance/questions.en.jsonl'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m   \u001b[0mjson_list\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjson_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      5\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m'+1'\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'stars'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m>=\u001b[0m\u001b[0;36m3\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m'-1'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfrac\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "# Optional sentiment labels, then shuffle the rows and truncate long texts.\n",
    "# Only frames with a 'stars' column get '+1'/'-1' labels; without this guard a\n",
    "# frame lacking that column raises KeyError and the cell stops before shuffling.\n",
    "if 'stars' in data.columns:\n",
    "    data['label'] = (data['stars'] >= 3).map({True: '+1', False: '-1'})\n",
    "# Fixed seed so the shuffled row order is reproducible across runs.\n",
    "data = data.sample(frac=1, random_state=42)\n",
    "# Newlines become '.', then each text keeps at most its first 500 space-separated pieces.\n",
    "data['text'] = data['text'].apply(lambda text: ' '.join(text.replace('\\n', '.').split(' ')[:500]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "551574a5-f461-4a6f-8c0a-97cf92747b74",
   "metadata": {},
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'data' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-9-c5d84736ba45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'data' is not defined"
     ]
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3187648-9c75-423d-bc21-ba655e30df6f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}