Session 7 - Second Hands-On
Assignment
1) Assignment 1 (500 points):
1) Submit Assignment 5 as Assignment 1. To be clear:
1) ONLY use datasetSentences.txt. (no augmentation required)
2) Your dataset must have around 12k examples.
3) Split Dataset into 70/30 Train and Test (no validation)
4) Convert floating-point labels into 5 classes (0-0.2, 0.2-0.4, 0.4-0.6, 0.6-0.8, 0.8-1.0)
5) Upload to GitHub and answer the following questions in the S7 - Assignment Solutions:
1) Share the link to your github repo (100 pts for code quality/file structure/model accuracy)
2) Share the link to your readme file (200 points for proper readme file)
3) Copy-paste the code related to your dataset preparation (100 pts)
4) Share your training log text (you MUST have evaluated test accuracy after every epoch) (200 pts)
5) Share the prediction on 10 samples picked from the test dataset. (100 pts)
2) Assignment 2 (300 points):
1) Train the model we wrote in class on the following two datasets:
1) http://www.cs.cmu.edu/~ark/QA-data/
2) https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs
2) Once done, upload the files to GitHub and answer the following questions in the S7 - Assignment Solutions:
1) Share the link to your GitHub repo (100 pts for code quality/file structure/model accuracy)
2) Share the link to your README file (100 pts for a proper README); this can be the second part of your Part 1 README (you may keep a single README describing both assignments)
3) Copy-paste the code related to your dataset preparation for both datasets. (100 pts)
Solution
Assignment 1
Classification API Structure:
1) Dataset
The dataset has two columns: Text and Label.
Sentiment labels:
S. No. | Label |
---|---|
1 | Very Negative |
2 | Negative |
3 | Neutral |
4 | Positive |
5 | Very Positive |
Original Data
2) EDA
EDA - Original Dataset
Word Cloud for each of the 5 sentiments in Training Data
Word Cloud for each of the 5 sentiments in Test Data
From the above clouds, we can see that the most commonly appearing words, such as "film" and "movie", occur across all five sentiments, so they can be treated as dataset-specific stopwords.
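As an illustration (not part of the submitted code), here is a minimal sketch of how such per-sentiment word clouds can be generated while treating "film" and "movie" as extra stopwords. It assumes the wordcloud and matplotlib packages and an sst_data DataFrame with src/trg columns, as built by the dataset code below:

# Hedged sketch: assumes the `wordcloud` package and an `sst_data` DataFrame
# with 'src' (sentence) and 'trg' (label) columns, as produced by SSTDataset below.
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

extra_stopwords = STOPWORDS.union({'film', 'movie'})  # dataset-specific stopwords

for label in ['very negative', 'negative', 'neutral', 'positive', 'very positive']:
    text = ' '.join(sst_data.loc[sst_data['trg'] == label, 'src'])
    wc = WordCloud(stopwords=extra_stopwords, background_color='white').generate(text)
    plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(label)
plt.show()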
Data Extraction Code
import random

import torch
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator


class NLPClassificationDataset():
    def __init__(self, data_path, seed, batch_size, device, split_ratio=[0.7, 0.3]):
        self.split_ratio = split_ratio
        self.data_path = data_path
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.seq_data = self.load_data(self.data_path)

    def load_data(self, data_path):
        # Subclasses must return a DataFrame with 'src' and 'trg' columns
        raise NotImplementedError

    def get_data(self):
        return self.seq_data

    def create_dataset(self, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_):
        try:
            SRC = Field(sequential=True, tokenize='spacy', batch_first=True, include_lengths=True)
            TRG = data.LabelField(tokenize='spacy', is_target=True, batch_first=True, sequential=True)
            fields = [('src', SRC), ('trg', TRG)]
            examples = [data.Example.fromlist([self.seq_data.src[i], self.seq_data.trg[i]], fields)
                        for i in range(self.seq_data.shape[0])]
            # Create the torchtext Dataset and split into train/test
            dataset = data.Dataset(examples, fields)
            (self.train_data, self.test_data) = dataset.split(split_ratio=self.split_ratio,
                                                              random_state=random.seed(self.seed))
            print(f"Number of training examples: {len(self.train_data.examples)}")
            print(f"Number of testing examples: {len(self.test_data.examples)}")
            # Build vocabulary (optionally with pretrained vectors)
            if vectors is None and unk_init is None:
                SRC.build_vocab(self.train_data)
            else:
                SRC.build_vocab(self.train_data, vectors=vectors, unk_init=unk_init)
            TRG.build_vocab(self.train_data)
            print(f"Unique tokens in source vocabulary: {len(SRC.vocab)}")
            print(f"Unique tokens in target vocabulary: {len(TRG.vocab)}")
            return self.train_data, self.test_data, SRC, TRG, self.seq_data
        except Exception as e:
            raise e

    def iterate_dataset(self):
        try:
            self.train_data, self.test_data, SRC, TRG, seq_data = self.create_dataset()
            self.train_iterator, self.test_iterator = BucketIterator.splits(
                (self.train_data, self.test_data),
                batch_size=self.batch_size,
                sort_key=lambda x: len(x.src),
                sort_within_batch=True,
                device=self.device)
            return self.train_iterator, self.test_iterator, SRC, TRG, seq_data
        except Exception as e:
            raise e
## SST Dataset
import os

import pandas as pd

from .NLPClassificationDataset import NLPClassificationDataset


class SSTDataset(NLPClassificationDataset):
    def __init__(self, data_path, seed, batch_size, device, split_ratio=[0.7, 0.3]):
        # super().__init__(data_path, seed, batch_size, device, split_ratio)
        self.split_ratio = split_ratio
        self.data_path = data_path
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        # Bin edges and class names for converting float sentiment scores into 5 classes
        self.ranges = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
        self.labels = ['very negative', 'negative', 'neutral', 'positive', 'very positive']
        self.label = [0, 1, 2, 3, 4]
        self.seq_data = self.load_data(self.data_path)

    def get_labels(self):
        return self.labels

    def load_data(self, sst_path):
        # Sentences, phrase dictionary and phrase-level sentiment scores
        sst_sents = pd.read_csv(os.path.join(sst_path, 'datasetSentences.txt'), delimiter='\t')
        sst_phrases = pd.read_csv(os.path.join(sst_path, 'dictionary.txt'), delimiter='|',
                                  names=['phrase', 'phrase_id'])
        sst_labels = pd.read_csv(os.path.join(sst_path, 'sentiment_labels.txt'), delimiter='|')
        # Keep only full sentences that also appear as phrases, then attach their sentiment values
        sst_sentences_phrases = pd.merge(sst_sents, sst_phrases, how='inner',
                                         left_on=['sentence'], right_on=['phrase'])
        sst = pd.merge(sst_sentences_phrases, sst_labels, how='inner',
                       left_on=['phrase_id'], right_on=['phrase ids'])[['sentence', 'sentiment values']]
        # Bucket the floating-point sentiment values into the 5 classes
        sst['labels'] = pd.cut(sst['sentiment values'], bins=self.ranges,
                               labels=self.labels, include_lowest=True)
        sst_data = sst[['sentence', 'labels']]
        sst_data.columns = ['src', 'trg']
        return sst_data
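For reference, a hedged usage sketch (placeholder path and hyperparameters, not the exact driver code from the repo) showing how the dataset class is wired to iterators:

# Hedged usage sketch: 'data/SST' is a placeholder path to the SST files.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sst = SSTDataset(data_path='data/SST', seed=1234, batch_size=64, device=device)
train_iterator, test_iterator, SRC, TRG, sst_data = sst.iterate_dataset()

# Each batch exposes src (token ids + lengths, since include_lengths=True) and trg (class index)
batch = next(iter(train_iterator))
text, text_lengths = batch.src
labels = batch.trg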
3) Model Building
Model Code:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class NLPBasicClassifier(nn.Module):
    # Define all the layers used in the model
    def __init__(self, vocab_size, embedding_dim, hidden_dim1, hidden_dim2, output_dim,
                 n_layers, bidirectional, dropout, pad_index):
        super().__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim1,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        self.fc1 = nn.Linear(hidden_dim1 * 2, hidden_dim2)
        self.fc2 = nn.Linear(hidden_dim2, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        # Activation function (defined but not applied in forward; the loss works on raw logits)
        self.act = nn.Softmax()  # F.log_softmax could be used instead

    def forward(self, text, text_lengths):
        # text = [batch size, sent_len]
        embedded = self.embedding(text)
        # embedded = [batch size, sent_len, emb dim]
        # Pack the padded sequence so the LSTM skips padding tokens
        packed_embedded = pack_padded_sequence(embedded, text_lengths.to('cpu'), batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        # hidden = [num_layers * num_directions, batch, hidden_size]
        # Concatenate the final forward and backward hidden states
        cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        # output, output_lengths = pad_packed_sequence(packed_output)  # pad back to max batch length if needed
        rel = self.relu(cat)
        dense1 = self.fc1(rel)
        drop = self.dropout(dense1)
        preds = self.fc2(drop)
        return preds
Model Structure:
NLPBasicClassifier(
(embedding): Embedding(16388, 100, padding_idx=1)
(lstm): LSTM(100, 256, num_layers=2, batch_first=True, bidirectional=True)
(fc1): Linear(in_features=512, out_features=128, bias=True)
(fc2): Linear(in_features=128, out_features=5, bias=True)
(relu): ReLU()
(dropout): Dropout(p=0.2, inplace=False)
(act): Softmax(dim=None)
)
The model has 4,015,253 trainable parameters
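A hedged sketch of how the model might be instantiated to match the printed structure above; it assumes the SRC/TRG fields and device from the dataset code, and the hyperparameter values are read off the printout (the exact variable names in the notebook may differ):

# Assumed hyperparameters, read off the printed structure above.
INPUT_DIM = len(SRC.vocab)        # 16388 in this run
EMBEDDING_DIM = 100
HIDDEN_DIM1 = 256
HIDDEN_DIM2 = 128
OUTPUT_DIM = len(TRG.vocab)       # 5 sentiment classes
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = SRC.vocab.stoi[SRC.pad_token]   # 1 here

model = NLPBasicClassifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM1, HIDDEN_DIM2,
                           OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX).to(device)

# Copy the pretrained GloVe vectors built during SRC.build_vocab into the embedding layer
model.embedding.weight.data.copy_(SRC.vocab.vectors)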
4) Training and Testing
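Before the logs, a minimal sketch of the per-epoch train/evaluate loop assumed to produce them (the actual trainer class in the repo may differ); it reports test accuracy after every epoch, as the assignment requires, and assumes model, train_iterator, test_iterator, and device from above:

# Hedged sketch of the per-epoch train/evaluate loop assumed to produce the logs below.
import torch
import torch.nn as nn
import torch.optim as optim

criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters())

def categorical_accuracy(preds, y):
    top_pred = preds.argmax(dim=1)
    return (top_pred == y).float().mean()

for epoch in range(10):
    model.train()
    epoch_loss = 0
    for batch in train_iterator:
        text, text_lengths = batch.src
        optimizer.zero_grad()
        predictions = model(text, text_lengths)
        loss = criterion(predictions, batch.trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Evaluate on the test set after every epoch
    model.eval()
    with torch.no_grad():
        test_acc = sum(categorical_accuracy(model(*b.src), b.trg).item()
                       for b in test_iterator) / len(test_iterator)
    print(f'Epoch: {epoch + 1:02} | Train Loss: {epoch_loss / len(train_iterator):.3f} '
          f'| Test Acc: {test_acc * 100:.2f}%')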
Training Logs:
Epoch: 01 | Epoch Time: 0m 1s
Train Loss: 1.533 | Train Acc: 29.72%
Val. Loss: 1.472 | Val. Acc: 34.75%
Epoch: 02 | Epoch Time: 0m 1s
Train Loss: 1.377 | Train Acc: 38.91%
Val. Loss: 1.344 | Val. Acc: 41.22%
Epoch: 03 | Epoch Time: 0m 1s
Train Loss: 1.237 | Train Acc: 44.73%
Val. Loss: 1.327 | Val. Acc: 42.01%
Epoch: 04 | Epoch Time: 0m 1s
Train Loss: 1.142 | Train Acc: 48.97%
Val. Loss: 1.379 | Val. Acc: 40.51%
Epoch: 05 | Epoch Time: 0m 1s
Train Loss: 0.997 | Train Acc: 56.26%
Val. Loss: 1.435 | Val. Acc: 43.44%
Epoch: 06 | Epoch Time: 0m 1s
Train Loss: 0.823 | Train Acc: 65.49%
Val. Loss: 1.598 | Val. Acc: 40.62%
Epoch: 07 | Epoch Time: 0m 1s
Train Loss: 0.682 | Train Acc: 72.18%
Val. Loss: 1.883 | Val. Acc: 39.53%
Epoch: 08 | Epoch Time: 0m 1s
Train Loss: 0.508 | Train Acc: 80.66%
Val. Loss: 2.120 | Val. Acc: 40.53%
Epoch: 09 | Epoch Time: 0m 1s
Train Loss: 0.369 | Train Acc: 86.48%
Val. Loss: 2.592 | Val. Acc: 37.79%
Epoch: 10 | Epoch Time: 0m 1s
Train Loss: 0.268 | Train Acc: 90.00%
Val. Loss: 3.311 | Val. Acc: 38.86%
Training and Testing Visualization
Train vs Test Accuracy
Train vs Test Loss
5) Prediction
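A hedged sketch of the inference helper assumed to produce the predictions below; spaCy tokenization mirrors the Field tokenizer, and the names (predict_sentiment, en_core_web_sm) are illustrative:

# Illustrative inference sketch; assumes the trained `model`, the SRC/TRG fields and `device` from above.
import spacy
import torch

nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    model.eval()
    tokens = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [SRC.vocab.stoi[t] for t in tokens]
    length = torch.LongTensor([len(indexed)])
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)   # [1, sent_len]
    with torch.no_grad():
        prediction = model(tensor, length)
    return TRG.vocab.itos[prediction.argmax(dim=1).item()]

print(predict_sentiment(model, "A comedy that is warm , inviting , and surprising ."))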
10 Correctly Classified Texts From Test Data
****************************************
***** Correctly Classified Text: *******
****************************************
1) Text: A perfectly pleasant if slightly comedy .
Target Sentiment: positive
Predicted Sentiment: positive
2) Text: Unfolds as one of the most politically audacious films of recent decades from any country , but especially from France .
Target Sentiment: positive
Predicted Sentiment: positive
3) Text: There 's a sheer unbridled delight in the way the story ...
Target Sentiment: positive
Predicted Sentiment: positive
4) Text: The Four Feathers is definitely , but if you go in knowing that , you might have fun in this cinematic .
Target Sentiment: positive
Predicted Sentiment: positive
5) Text: ... the story , like 's Bolero , builds to a that encompasses many more paths than we started with .
Target Sentiment: positive
Predicted Sentiment: positive
6) Text: A comedy that is warm , inviting , and surprising .
Target Sentiment: very positive
Predicted Sentiment: very positive
7) Text: and largely devoid of the depth or that would make watching such a graphic treatment of the crimes .
Target Sentiment: negative
Predicted Sentiment: negative
8) Text: can be as tiresome as 9 seconds of Jesse ' Castro , which are
Target Sentiment: negative
Predicted Sentiment: negative
9) Text: , I 'd rather watch them on the Animal Planet .
Target Sentiment: very negative
Predicted Sentiment: very negative
10) Text: Good - sequel .
Target Sentiment: positive
Predicted Sentiment: positive
10 Incorrectly Classified Texts From Test Data
****************************************
***** Incorrectly Classified Text: *******
****************************************
1) Text: The movie is n't just hilarious : It 's witty and inventive , too , and in hindsight , it is n't even all that dumb .
Target Sentiment: very positive
Predicted Sentiment: positive
2) Text: Stands as a document of what it felt like to be a New Yorker -- or , really , to be a human being -- in the weeks after 9\/11 .
Target Sentiment: positive
Predicted Sentiment: neutral
3) Text: It works its magic with such exuberance and passion that the film 's length becomes a part of its fun .
Target Sentiment: very positive
Predicted Sentiment: positive
4) Text: It does n't do the original any particular , but neither does it exude any charm or personality .
Target Sentiment: negative
Predicted Sentiment: neutral
5) Text: Do n't expect any surprises in this checklist of cliches ...
Target Sentiment: very negative
Predicted Sentiment: negative
6) Text: The film 's tone and pacing are off almost from the get - go .
Target Sentiment: negative
Predicted Sentiment: neutral
7) Text: It 's deep - by a to every bodily gag in There 's Something About Mary and a parallel clone - gag .
Target Sentiment: neutral
Predicted Sentiment: negative
8) Text: The lower your expectations , the more you 'll enjoy it .
Target Sentiment: negative
Predicted Sentiment: positive
9) Text: A film without surprise geared toward maximum comfort and familiarity .
Target Sentiment: negative
Predicted Sentiment: positive
10) Text: A chiller resolutely without chills .
Target Sentiment: very negative
Predicted Sentiment: neutral
6) Evaluation
Confusion Matrix on testing data
Evaluation Metrics on Test Data
F1 Macro Score: 0.38181634331169134
Accuracy: 38.98405197873597 %
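A minimal sketch, assuming scikit-learn, of how these metrics and the confusion matrix can be computed from predictions collected over the test iterator:

# Hedged sketch: y_true / y_pred are class indices collected over test_iterator.
import torch
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

y_true, y_pred = [], []
model.eval()
with torch.no_grad():
    for batch in test_iterator:
        preds = model(*batch.src).argmax(dim=1)
        y_true.extend(batch.trg.tolist())
        y_pred.extend(preds.tolist())

print('F1 Macro Score:', f1_score(y_true, y_pred, average='macro'))
print('Accuracy:', accuracy_score(y_true, y_pred) * 100, '%')
print(confusion_matrix(y_true, y_pred))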
Assignment 2
The datasets are subclasses of:
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator
import spacy
import numpy as np
import math
import time


class SeqDataset():
    def __init__(self, data_path, seed, batch_size, device, split_ratio=[0.7, 0.3]):
        self.split_ratio = split_ratio
        self.data_path = data_path
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.seq_data = self.load_data(self.data_path)

    def load_data(self, data_path):
        # Subclasses must return a DataFrame with 'src' and 'trg' columns
        raise NotImplementedError

    def get_data(self):
        return self.seq_data

    def create_dataset(self):
        try:
            SRC = Field(tokenize='spacy', init_token='<sos>', eos_token='<eos>', lower=True)
            TRG = Field(tokenize='spacy', init_token='<sos>', eos_token='<eos>', lower=True)
            fields = [('src', SRC), ('trg', TRG)]
            examples = [data.Example.fromlist([self.seq_data.src[i], self.seq_data.trg[i]], fields)
                        for i in range(self.seq_data.shape[0])]
            # Create the torchtext Dataset and split into train/test
            dataset = data.Dataset(examples, fields)
            (self.train_data, self.test_data) = dataset.split(split_ratio=self.split_ratio,
                                                              random_state=random.seed(self.seed))
            print(f"Number of training examples: {len(self.train_data.examples)}")
            print(f"Number of testing examples: {len(self.test_data.examples)}")
            # Build vocabularies
            SRC.build_vocab(self.train_data, min_freq=2)
            TRG.build_vocab(self.train_data, min_freq=2)
            print(f"Unique tokens in source vocabulary: {len(SRC.vocab)}")
            print(f"Unique tokens in target vocabulary: {len(TRG.vocab)}")
            return self.train_data, self.test_data, SRC, TRG
        except Exception as e:
            raise e

    def iterate_dataset(self):
        try:
            self.train_data, self.test_data, SRC, TRG = self.create_dataset()
            self.train_iterator, self.test_iterator = BucketIterator.splits(
                (self.train_data, self.test_data),
                batch_size=self.batch_size,
                sort_key=lambda x: len(x.src),
                sort_within_batch=True,
                device=self.device)
            return self.train_iterator, self.test_iterator, SRC, TRG, self.seq_data
        except Exception as e:
            raise e
1) Wikipedia QA Data
Task: Answer Questions
Description:
There are three directories, one for each year of students: S08, S09, and S10.
The file “question_answer_pairs.txt” contains the questions and answers. The first line of the file contains column names for the tab-separated data fields in the file. This first line follows:
ArticleTitle Question Answer DifficultyFromQuestioner DifficultyFromAnswerer ArticleFile
Field 1 is the name of the Wikipedia article from which questions and answers initially came.
Field 2 is the question.
Field 3 is the answer.
Field 4 is the prescribed difficulty rating for the question as given to the question-writer.
Field 5 is a difficulty rating assigned by the individual who evaluated and answered the question, which may differ from the difficulty in field 4.
Field 6 is the relative path to the prefix of the article files. html files (.htm) and cleaned text (.txt) files are provided.
Data Exploration
The data consist of: [‘S09’, ‘S08’, ‘S10’, ‘LICENSE-S08,S09’, ‘README.v1.2’]
There are 3 folders: S08, S09 and S10. The other 2 are files.
Within each of S08, S09, S10, there are:
1) data folder containing wikipedia articles
2) question answer pair text file
We only need these question_answer_pairs.txt files.
gdrive/MyDrive/TSAI_END2/Session7/Assignment2/data/QuestionAnswerData/Question_Answer_Dataset_v1.2/S09
['data', 'question_answer_pairs.txt']
gdrive/MyDrive/TSAI_END2/Session7/Assignment2/data/QuestionAnswerData/Question_Answer_Dataset_v1.2/S08
['data', 'question_answer_pairs.txt']
gdrive/MyDrive/TSAI_END2/Session7/Assignment2/data/QuestionAnswerData/Question_Answer_Dataset_v1.2/S10
['data', 'question_answer_pairs.txt']
gdrive/MyDrive/TSAI_END2/Session7/Assignment2/data/QuestionAnswerData/Question_Answer_Dataset_v1.2/LICENSE-S08,S09
gdrive/MyDrive/TSAI_END2/Session7/Assignment2/data/QuestionAnswerData/Question_Answer_Dataset_v1.2/README.v1.2
All three ‘question_answer_pairs.txt’ files were read and combined. The result is shown below.
ArticleTitle Question Answer DifficultyFromQuestioner DifficultyFromAnswerer ArticleFile
0 Alessandro_Volta Was Volta an Italian physicist? yes easy easy data/set4/a10
1 Alessandro_Volta Was Volta an Italian physicist? yes easy easy data/set4/a10
2 Alessandro_Volta Is Volta buried in the city of Pittsburgh? no easy easy data/set4/a10
3 Alessandro_Volta Is Volta buried in the city of Pittsburgh? no easy easy data/set4/a10
4 Alessandro_Volta Did Volta have a passion for the study of elec... yes easy medium data/set4/a10
... ... ... ... ... ... ...
3993 Zebra What areas do the Grevy's Zebras inhabit? NaN hard NaN data/set1/a9
3994 Zebra Which species of zebra is known as the common ... Plains Zebra (Equus quagga, formerly Equus bur... hard medium data/set1/a9
3995 Zebra Which species of zebra is known as the common ... Plains Zebra hard medium data/set1/a9
3996 Zebra At what age can a zebra breed? five or six hard medium data/set1/a9
3997 Zebra At what age can a zebra breed? 5 or 6 hard hard data/set1/a9
3998 rows × 6 columns
After that, only Question and Answer columns were extracted to form the below data:
Question Answer
0 Was Volta an Italian physicist? yes
1 Was Volta an Italian physicist? yes
2 Is Volta buried in the city of Pittsburgh? no
3 Is Volta buried in the city of Pittsburgh? no
4 Did Volta have a passion for the study of elec... yes
... ... ...
3993 What areas do the Grevy's Zebras inhabit? NaN
3994 Which species of zebra is known as the common ... Plains Zebra (Equus quagga, formerly Equus bur...
3995 Which species of zebra is known as the common ... Plains Zebra
3996 At what age can a zebra breed? five or six
3997 At what age can a zebra breed? 5 or 6
3998 rows × 2 columns
There are also some null values, so those rows need to be removed.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3998 entries, 0 to 3997
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Question 3961 non-null object
1 Answer 3422 non-null object
dtypes: object(2)
memory usage: 62.6+ KB
After removing, the final data has 3422 rows:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3422 entries, 0 to 3421
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Question 3422 non-null object
1 Answer 3422 non-null object
dtypes: object(2)
memory usage: 53.6+ KB
Code:
import torch
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator
import os
import pandas as pd

from .SeqDataset import SeqDataset


class WikiDataset(SeqDataset):
    def __init__(self, data_path, seed, batch_size, device, split_ratio=[0.7, 0.3]):
        # super().__init__(data_path, seed, batch_size, device, split_ratio)
        self.split_ratio = split_ratio
        self.data_path = data_path
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.seq_data = self.load_data(self.data_path)

    def load_data(self, data_path):
        seq_data = pd.DataFrame()
        # Walk S08/S09/S10 and read every question_answer_pairs.txt
        for subdir in os.listdir(data_path):
            subdirectory = os.path.join(data_path, subdir)
            if os.path.isdir(subdirectory):
                for txt_file in os.listdir(subdirectory):
                    if '.txt' in txt_file:
                        df = pd.read_csv(os.path.join(subdirectory, txt_file),
                                         sep='\t', encoding='ISO-8859-1')
                        seq_data = pd.concat([seq_data, df]).reset_index(drop=True)
        # Keep only the Question/Answer columns and drop rows with missing values
        seq_data = seq_data[['Question', 'Answer']]
        seq_data = seq_data.dropna().reset_index(drop=True)
        seq_data.columns = ['src', 'trg']
        return seq_data
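A hedged usage sketch for this class; the directory path is a placeholder for the extracted Question_Answer_Dataset_v1.2 folder:

# Illustrative only; the directory layout matches the listing shown earlier.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
wiki = WikiDataset(data_path='data/Question_Answer_Dataset_v1.2',
                   seed=1234, batch_size=128, device=device)
train_iterator, test_iterator, SRC, TRG, qa_pairs = wiki.iterate_dataset()
print(qa_pairs.head())   # src = Question, trg = Answer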
2) Quora Data
Task: Generate a duplicate question based on a given question
Description:
The Quora dataset consists of over 400,000 potential duplicate question pairs. Each line contains IDs for each question in the pair, the full text of each question, and a binary value indicating whether the pair is truly a duplicate.
Sample Data:
id qid1 qid2 question1 question2 is_duplicate
0 0 1 2 What is the step by step guide to invest in sh... What is the step by step guide to invest in sh... 0
1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... What would happen if the Indian government sto... 0
2 2 5 6 How can I increase the speed of my internet co... How can Internet speed be increased by hacking... 0
3 3 7 8 Why am I mentally very lonely? How can I solve... Find the remainder when [math]23^{24}[/math] i... 0
4 4 9 10 Which one dissolve in water quikly sugar, salt... Which fish would survive in salt water? 0
id - unique questions pair id
qid1 - question 1 id
qid2 - question 2 id
question1 - full text of question 1
question2 - full text of question 2
is_duplicate - 0 indicates the pair is not a duplicate, while 1 indicates duplicate questions
For our purpose, we only need the question1 and question2 columns, restricted to rows where is_duplicate=1.
question1 question2
0 Astrology: I am a Capricorn Sun Cap moon and c... I'm a triple Capricorn (Sun, Moon and ascendan...
1 How can I be a good geologist? What should I do to be a great geologist?
2 How do I read and find my YouTube comments? How can I see all my Youtube comments?
3 What can make Physics easy to learn? How can you make physics easy to learn?
4 What was your first sexual experience like? What was your first sexual experience?
... ... ...
149258 What are some outfit ideas to wear to a frat p... What are some outfit ideas wear to a frat them...
149259 Why is Manaphy childish in Pokémon Ranger and ... Why is Manaphy annoying in Pokemon ranger and ...
149260 How does a long distance relationship work? How are long distance relationships maintained?
149261 What does Jainism say about homosexuality? What does Jainism say about Gays and Homosexua...
149262 Do you believe there is life after death? Is it true that there is life after death?
Since there are no nulls in the above data, it is used as-is.
Code:
import torch
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator
import pandas as pd

from .SeqDataset import SeqDataset


class QuoraDataset(SeqDataset):
    def __init__(self, data_path, seed, batch_size, device, split_ratio=[0.7, 0.3]):
        # super(QuoraDataset, self).__init__(data_path, seed, batch_size, device, split_ratio)
        self.split_ratio = split_ratio
        self.data_path = data_path
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.seq_data = self.load_data(self.data_path)

    def load_data(self, data_path):
        seq_data = pd.read_csv(data_path, sep='\t')
        seq_data = seq_data[seq_data['is_duplicate'] == 1][['question1', 'question2']]
        seq_data.reset_index(drop=True, inplace=True)
        seq_data.columns = ['src', 'trg']
        return seq_data
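Similarly, a hedged usage sketch; the TSV path is a placeholder for the downloaded Quora file:

# Illustrative only; 'quora_duplicate_questions.tsv' is a placeholder filename.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
quora = QuoraDataset(data_path='data/quora_duplicate_questions.tsv',
                     seed=1234, batch_size=128, device=device)
train_iterator, test_iterator, SRC, TRG, pairs = quora.iterate_dataset()
print(pairs.head())   # src = question1, trg = question2 (duplicates only)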
Additional Datasets
1) AmbigNQ Light Data
Sequence to Sequence Prediction on AmbigQA Light Data
Task: Answer Question
Description:
AmbigNQ is a dataset covering 14,042 questions from NQ-open, an existing open-domain QA benchmark; the authors find that over half of the questions in NQ-open are ambiguous.
AmbigNQ is distributed in two versions: a full version with all annotation metadata and a light version with only inputs and outputs. Here, the light version is used.
The light version contains:
- train_light.json (3.3M)
- dev_light.json (977K)
Each {train|dev}_light.json file contains a list of dictionaries, one per datapoint, with the following keys:
- id (string): an identifier for the question, consistent with the original NQ dataset.
- question (string): a question. This is identical to the question in the original NQ except that the string is postprocessed to start uppercase and end with a question mark.
- annotations (a list of dictionaries): a list of all acceptable outputs, where each output is a dictionary that represents either a single answer or multiple question-answer pairs:
  - type: singleAnswer or multipleQAs
  - (If type is singleAnswer) answer: a list of strings that are all acceptable answer texts.
  - (If type is multipleQAs) qaPairs: a list of dictionaries with question and answer, where question is a string and answer is a list of strings that are all acceptable answer texts.
Sample Data:
annotations id question
0 [{'type': 'multipleQAs', 'qaPairs': [{'questio... -4469503464110108318 When did the simpsons first air on television?
1 [{'type': 'singleAnswer', 'answer': ['David Mo... 4790842463458965203 Who played george washington in the john adams...
2 [{'type': 'multipleQAs', 'qaPairs': [{'questio... -6631915997977101143 What is the legal age of marriage in usa?
3 [{'type': 'multipleQAs', 'qaPairs': [{'questio... -3098213414945179817 Who starred in barefoot in the park on broadway?
4 [{'type': 'multipleQAs', 'qaPairs': [{'questio... -927805218867163489 When did the manhattan project began and end?
... ... ... ...
10031 [{'type': 'multipleQAs', 'qaPairs': [{'questio... 8643122201694054820 When do the summer holidays start for schools?
10032 [{'type': 'multipleQAs', 'qaPairs': [{'questio... 9033094464364994905 Who is the band in the movie 10 things i hate ...
10033 [{'type': 'singleAnswer', 'answer': ['Gwynne E... 9101518012234561119 Who was the last person in the uk to be executed?
10034 [{'type': 'multipleQAs', 'qaPairs': [{'questio... 926954766593964346 Who does wonder woman end up with in the comics?
10035 [{'type': 'multipleQAs', 'qaPairs': [{'questio... 98262964342640738 When were the first pair of jordans released?
10036 rows × 3 columns
After expanding annotations column:
type qaPairs answer question
0 multipleQAs [{'question': 'When did the Simpsons first air... NaN When did the simpsons first air on television?
1 singleAnswer NaN [David Morse] Who played george washington in the john adams...
2 multipleQAs [{'question': 'What is the legal age of marria... NaN What is the legal age of marriage in usa?
3 multipleQAs [{'question': 'Who starred in barefoot in the ... NaN Who starred in barefoot in the park on broadway?
4 multipleQAs [{'question': 'Based on the initial thoughts o... NaN When did the manhattan project began and end?
... ... ... ... ...
10246 multipleQAs [{'question': 'Who is the band at Club Skunk i... NaN Who is the band in the movie 10 things i hate ...
10247 multipleQAs [{'question': 'Who is the band that performs a... NaN Who is the band in the movie 10 things i hate ...
10248 singleAnswer NaN [Gwynne Evans and Peter Allen] Who was the last person in the uk to be executed?
10249 multipleQAs [{'question': 'Who does wonder woman end up wi... NaN Who does wonder woman end up with in the comics?
10250 multipleQAs [{'question': 'When were the first pair of Air... NaN When were the first pair of jordans released?
10251 rows × 4 columns
Procedure to create question-answer pairs:
1) The data was divided into 2 portions:
1) type = ‘singleAnswer’: for this portion, the question column is taken directly and the answer is taken from the annotations column.
question answer
0 Who played george washington in the john adams... David Morse
1 When did the frozen ride open at epcot? June 21, 2016
2 Name the landforms that form the boundaries of... Aravali Range, Satpura Range, Vindhyan Range
3 When was the first airplane used in war? Blériot XI
4 When was the first airplane used in war? Nieuport IV
... ... ...
8346 Who played alotta fagina in austin powers movie? Fabiana Udenio
8347 Who played alotta fagina in austin powers movie? Fabiana Udenio
8348 Who played alotta fagina in austin powers movie? Fabiana Udenio
8349 Who wrote make you feel my love song? Bob Dylan
8350 Who was the last person in the uk to be executed? Gwynne Evans and Peter Allen
8351 rows × 2 columns
2) type = ‘multipleQAs’: for this portion, qaPairs is expanded row-wise into question and answer columns.
question answer
0 When did the Simpsons first air on television ... April 19, 1987
1 When did the Simpsons first air as a half-hour... December 17, 1989
2 What is the legal age of marriage, without par... 18 years of age
3 What is the legal age of marriage, without par... 18
4 What is the legal age of marriage, without par... 19
... ... ...
19466 Who does wonder woman end up with in All Star ... General Steven Rockwell Trevor
19467 Who does wonder woman end up with in All Star ... Steve Trevor
19468 Who does wonder woman end up with in the new 52? Superman and Batman
19469 When were the first pair of Air Jordans releas... early 1984
19470 When were the first pair of Air Jordans releas... November 17, 1984
19471 rows × 2 columns
2) Then both portions are concatenated vertically (illustrated in the toy sketch after this list).
3) Steps 1-2 are repeated for both train and test data.
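To make steps 1-2 concrete, here is a toy illustration (synthetic records, not the real dataset) of how pandas flattens the two annotation types:

# Toy records with the same structure as the light files (values are made up).
import pandas as pd

records = [
    {'question': 'Q1?', 'annotations': [{'type': 'singleAnswer', 'answer': ['A', 'B']}]},
    {'question': 'Q2?', 'annotations': [{'type': 'multipleQAs',
                                         'qaPairs': [{'question': 'Q2a?', 'answer': ['C']},
                                                     {'question': 'Q2b?', 'answer': ['D']}]}]},
]
df = pd.json_normalize(records, record_path='annotations', meta=['question'])

# Portion 1: singleAnswer -> question from the record, one row per acceptable answer
sa = df[df['type'] == 'singleAnswer'][['question', 'answer']].explode('answer')

# Portion 2: multipleQAs -> flatten each qaPairs list into its own question/answer rows
mqa = pd.concat([pd.DataFrame(x) for x in df[df['type'] == 'multipleQAs']['qaPairs']]).explode('answer')

# Combine both portions vertically
print(pd.concat([sa, mqa]).reset_index(drop=True))

Code: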
import torch
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import json
import pandas as pd

from .SeqDataset import SeqDataset


class AmbigNqQADataset(SeqDataset):
    def __init__(self, data_path, seed, batch_size, device, split_ratio=[0.7, 0.3]):
        # super().__init__(data_path, seed, batch_size, device, split_ratio)
        self.split_ratio = split_ratio
        self.data_path = data_path
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.seq_data = self.load_data(self.data_path)

    def load_data(self, data_path):
        # Download the zip archive and read the train/dev JSON files from it
        resp = urlopen(data_path)
        zipfile = ZipFile(BytesIO(resp.read()))
        files = zipfile.namelist()
        for fs in files:
            if 'train' in fs:
                with zipfile.open(fs) as json_file:
                    train_json_data = json.load(json_file)
            elif 'dev' in fs:
                with zipfile.open(fs) as json_file:
                    test_json_data = json.load(json_file)
        # Extract question-answer pairs from the annotations
        seq_data = pd.DataFrame()
        for json_data in [train_json_data, test_json_data]:
            df = pd.json_normalize(json_data, record_path='annotations', meta=['question'])
            # singleAnswer portion: question from the record, one row per acceptable answer
            sa_df = df[df['type'] == 'singleAnswer'][['question', 'answer']]
            sa_df = sa_df.explode('answer').reset_index(drop=True)
            # multipleQAs portion: flatten each qaPairs list into question/answer rows
            mqa_df = pd.concat(
                [pd.DataFrame(x) for x in df[df['type'] == 'multipleQAs'].reset_index(drop=True)['qaPairs']],
                keys=df.index).reset_index(level=1, drop=True).reset_index(drop=True)
            mqa_df = mqa_df.explode('answer').reset_index(drop=True)
            # Combine both portions vertically
            if len(seq_data) == 0:
                seq_data = pd.concat([sa_df, mqa_df]).reset_index(drop=True)
            else:
                seq_data = pd.concat([seq_data, sa_df, mqa_df]).reset_index(drop=True)
        seq_data.reset_index(drop=True, inplace=True)
        seq_data.columns = ['src', 'trg']
        return seq_data
2) Commonsense QA Data
Task: Answer commonsense questions
Description:
CommonsenseQA is a multiple-choice question answering dataset that requires different types of commonsense knowledge to predict the correct answers. It contains 12,102 questions with one correct answer and four distractor answers. The dataset is provided in two major training/validation/testing set splits.
There are 3 JSON files: train, validation and test.
We consider only the train and validation files because the test file does not contain answers.
Each line in a JSON file represents a record. Each record consists of:
1) answerKey: the label of the correct option.
2) id: unique question id.
3) question: a dictionary of:
i) question_concept - the category to which the question belongs.
ii) choices - the options among which the answer lies; a list of dictionaries containing:
a) label: one of A, B, C, D or E
b) text: the option text
iii) stem - the question text
Below is the structure when converted to a DataFrame:
answerKey id question.question_concept question.choices question.stem
0 A 075e483d21c29a511267ef62bedc0461 punishing [{'label': 'A', 'text': 'ignore'}, {'label': '... The sanctions against the school were a punish...
1 B 61fe6e879ff18686d7552425a36344c8 people [{'label': 'A', 'text': 'race track'}, {'label... Sammy wanted to go to where the people were. ...
2 A 4c1cb0e95b99f72d55c068ba0255c54d choker [{'label': 'A', 'text': 'jewelry store'}, {'la... To locate a choker not located in a jewelry bo...
3 D 02e821a3e53cb320790950aab4489e85 highway [{'label': 'A', 'text': 'united states'}, {'la... Google Maps and other highway and street GPS s...
4 C 23505889b94e880c3e89cff4ba119860 fox [{'label': 'A', 'text': 'pretty flowers.'}, {'... The fox walked from the city into the forest, ...
... ... ... ... ... ...
9736 E f1b2a30a1facff543e055231c5f90dd0 going public [{'label': 'A', 'text': 'consequences'}, {'lab... What would someone need to do if he or she wan...
9737 D a63b4d0c0b34d6e5f5ce7b2c2c08b825 chair [{'label': 'A', 'text': 'stadium'}, {'label': ... Where might you find a chair at an office?
9738 A 22d0eea15e10be56024fd00bb0e4f72f jeans [{'label': 'A', 'text': 'shopping mall'}, {'la... Where would you buy jeans in a place with a la...
9739 A 7c55160a4630de9690eb328b57a18dc2 well [{'label': 'A', 'text': 'fairytale'}, {'label'... John fell down the well. he couldn't believe ...
9740 C dd640927f9920930501fb8dc3efc196b electricity [{'label': 'A', 'text': 'put in to the water'}... I forgot to pay the electricity bill, now what...
9741 rows × 5 columns
To create the answer column, answerKey is matched against the label field of each entry in question.choices.
answerKey id question.question_concept question.choices question.stem answer
0 A 075e483d21c29a511267ef62bedc0461 punishing [{'label': 'A', 'text': 'ignore'}, {'label': '... The sanctions against the school were a punish... ignore
1 B 61fe6e879ff18686d7552425a36344c8 people [{'label': 'A', 'text': 'race track'}, {'label... Sammy wanted to go to where the people were. ... populated areas
2 A 4c1cb0e95b99f72d55c068ba0255c54d choker [{'label': 'A', 'text': 'jewelry store'}, {'la... To locate a choker not located in a jewelry bo... jewelry store
3 D 02e821a3e53cb320790950aab4489e85 highway [{'label': 'A', 'text': 'united states'}, {'la... Google Maps and other highway and street GPS s... atlas
4 C 23505889b94e880c3e89cff4ba119860 fox [{'label': 'A', 'text': 'pretty flowers.'}, {'... The fox walked from the city into the forest, ... natural habitat
... ... ... ... ... ... ...
9736 E f1b2a30a1facff543e055231c5f90dd0 going public [{'label': 'A', 'text': 'consequences'}, {'lab... What would someone need to do if he or she wan... telling all
9737 D a63b4d0c0b34d6e5f5ce7b2c2c08b825 chair [{'label': 'A', 'text': 'stadium'}, {'label': ... Where might you find a chair at an office? cubicle
9738 A 22d0eea15e10be56024fd00bb0e4f72f jeans [{'label': 'A', 'text': 'shopping mall'}, {'la... Where would you buy jeans in a place with a la... shopping mall
9739 A 7c55160a4630de9690eb328b57a18dc2 well [{'label': 'A', 'text': 'fairytale'}, {'label'... John fell down the well. he couldn't believe ... fairytale
9740 C dd640927f9920930501fb8dc3efc196b electricity [{'label': 'A', 'text': 'put in to the water'}... I forgot to pay the electricity bill, now what... produce heat
9741 rows × 6 columns
From the above, question.stem is taken as the question (source) column and answer as the target column.
Code:
import torch
from torchtext.legacy import data
from torchtext.legacy.data import Field, BucketIterator
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import json
import pandas as pd

from .SeqDataset import SeqDataset


class CommonsenseQADataset(SeqDataset):
    def __init__(self, data_path, seed, batch_size, device, split_ratio=[0.7, 0.3]):
        # super().__init__(data_path, seed, batch_size, device, split_ratio)
        self.split_ratio = split_ratio
        self.data_path = data_path
        self.seed = seed
        self.device = device
        self.batch_size = batch_size
        self.seq_data = self.load_data(self.data_path)

    def load_data(self, data_path):
        # Accept either a single URL/path or a list of them (train + validation)
        if not isinstance(data_path, list):
            data_path = [data_path]
        seq_data = pd.DataFrame()
        for url in data_path:
            # Each file is newline-delimited JSON: one record per line
            resp = urlopen(url).read().decode()
            records = pd.read_json(resp, lines=True)
            df = pd.json_normalize(records.to_dict(orient='records'))
            # The answer text is the choice whose label matches answerKey
            df['trg'] = df.apply(
                lambda r: [x for x in r['question.choices'] if x['label'] == r['answerKey']][0]['text'],
                axis=1)
            df = df[['question.stem', 'trg']]
            seq_data = pd.concat([seq_data, df]).reset_index(drop=True)
        seq_data.reset_index(drop=True, inplace=True)
        seq_data.columns = ['src', 'trg']
        return seq_data
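Finally, a hedged usage sketch for this class; the URLs are placeholders for the CommonsenseQA train and validation jsonl files:

# Illustrative only; substitute the real CommonsenseQA train/dev jsonl URLs or local paths.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
urls = ['https://example.com/train_rand_split.jsonl',   # placeholder URL
        'https://example.com/dev_rand_split.jsonl']     # placeholder URL
csqa = CommonsenseQADataset(data_path=urls, seed=1234, batch_size=128, device=device)
train_iterator, test_iterator, SRC, TRG, qa_pairs = csqa.iterate_dataset()
print(qa_pairs.head())   # src = question.stem, trg = answer text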