{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f857d1db-7d6d-4c76-af4f-d508a4027192",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at the first two records of the English question file.\n",
    "# The files in data/x-stance use the .jsonl extension (one JSON record per line).\n",
    "!head -2 data/x-stance/questions.en.jsonl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6421d22-ca47-4ecf-b667-8f19f4cb035a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the per-language question files.\n",
    "!ls data/x-stance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d288f21-0b89-4974-bded-d5ca9ff24f82",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67b7a07f-26d9-4e6a-8473-2614f6b34887",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the JSON Lines file into a list of dicts, then into a DataFrame.\n",
    "# The `with` block closes the file handle; UTF-8 is requested explicitly because\n",
    "# the question texts may contain non-ASCII characters and the default encoding\n",
    "# depends on the locale. Blank lines are skipped so they do not break json.loads.\n",
    "questions_path = 'data/x-stance/questions.it.jsonl'\n",
    "with open(questions_path, encoding='utf-8') as questions_file:\n",
    "    json_list = [json.loads(line) for line in questions_file if line.strip()]\n",
    "data = pd.DataFrame(json_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d505a983-7d08-4fa0-8281-99027e8edd4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sampling configuration.\n",
    "k = 100                            # rows drawn per sample\n",
    "seed_list = [100, 13, 21, 42, 87]  # one sample is drawn per seed\n",
    "shuffle_seed = 42                  # makes the full-frame shuffle reproducible"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "824ff804-347d-4913-a3f0-20bd38cb4159",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show which `{k}-{seed}` directories already exist for this task.\n",
    "!ls /home/mist/projects/LM-BFF/data/k-shot/x-stance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ddb0ab8-0dfb-44bc-99c3-69829c86d8a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): machine-specific absolute path; adjust when running elsewhere.\n",
    "output_dir = '/home/mist/projects/LM-BFF/data/k-shot/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b8b604e-5767-4bdb-b620-2d4c464c595c",
   "metadata": {},
   "outputs": [],
   "source": [
    "task = 'x-stance'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59dd94e5-b067-4644-832f-bc0e45d3486f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw one k-row sample per seed and make sure its output directory exists.\n",
    "task_dir = os.path.join(output_dir, task)  # loop-invariant, so built once\n",
    "k_shot_samples = {}\n",
    "for seed in seed_list:\n",
    "    setting_dir = os.path.join(task_dir, f\"{k}-{seed}\")\n",
    "    os.makedirs(setting_dir, exist_ok=True)\n",
    "    # Keep the sample (keyed by seed) instead of discarding it.\n",
    "    k_shot_samples[seed] = data.sample(n=k, random_state=seed)\n",
    "# TODO: the samples are not written into the `{k}-{seed}` directories yet.\n",
    "# NOTE(review): these samples reflect `data` as it is when this cell runs;\n",
    "# re-run it after later cells modify `data` if those changes are needed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "531acdd0-cbf6-430a-91e2-66923236908c",
   "metadata": {},
   "outputs": [],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bcf450cf-cb23-462f-b070-3529c1dfa86d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of questions per topic.\n",
    "data['topic'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b07328e8-5fcf-4fad-9468-49ff91652ef9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke check: every seed must be able to draw k rows\n",
    "# (sample() raises if k exceeds the number of rows).\n",
    "for seed in seed_list:\n",
    "    assert len(data.sample(n=k, random_state=seed)) == k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d7aa681-bd14-4c48-881a-9130a9b88edc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Integer id for every topic name.\n",
    "label_encoding = {\n",
    "    'Infrastructure & Environment': 0,\n",
    "    'Economy': 1,\n",
    "    'Security': 2,\n",
    "    'Immigration': 3,\n",
    "    'Society': 4,\n",
    "    'Education': 5,\n",
    "    'Foreign Policy': 6,\n",
    "    'Finances': 7,\n",
    "    'Welfare': 8,\n",
    "    'Healthcare': 9,\n",
    "    'Political System': 10,\n",
    "    'Digitisation': 11,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c2fad2e-77cf-4eff-9dda-5773f59f4402",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode each topic as its integer label; a topic missing from\n",
    "# `label_encoding` raises KeyError rather than being silently dropped.\n",
    "data['label'] = data['topic'].apply(lambda topic: label_encoding[topic])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e4c0a41-31a8-4620-bfed-c776a21e0c1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fc29374-8495-43b7-829f-90c11bf8a974",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the rows and cap every text at `max_words` space-separated tokens,\n",
    "# replacing newlines with '.' first.\n",
    "# The former `stars`-based '+1'/'-1' labelling was removed: the question records\n",
    "# only provide id, text and topic, so reading `stars` raised KeyError, and\n",
    "# `label` is already derived from `topic`.\n",
    "max_words = 500\n",
    "data = data.sample(frac=1, random_state=shuffle_seed)\n",
    "data['text'] = data['text'].apply(\n",
    "    lambda text: ' '.join(text.replace('\\n', '.').split(' ')[:max_words])\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "551574a5-f461-4a6f-8c0a-97cf92747b74",
   "metadata": {},
   "outputs": [],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3187648-9c75-423d-bc21-ba655e30df6f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}