# Load the BERT tokenizer.
# NOTE(review): the receiver of `from_pretrained` was lost in extraction;
# presumably `BertTokenizer` from the `transformers` package — confirm
# against the original post/imports.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.

    @param    data (np.array): Array of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
              tokens should be attended to by the model.
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        # `encode_plus` will:
        #   (1) Tokenize the sentence
        #   (2) Add the `[CLS]` and `[SEP]` token to the start and end
        #   (3) Truncate/Pad sentence to max length
        #   (4) Map tokens to their IDs
        #   (5) Create attention mask
        #   (6) Return a dictionary of outputs
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  # Preprocess sentence (helper defined elsewhere)
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # Max length to truncate/pad (module-level constant)
            pad_to_max_length=True,         # Pad sentence to max length
                                            # (deprecated in newer transformers;
                                            # kept to preserve original behavior)
            # return_tensors='pt',          # Return PyTorch tensor
            return_attention_mask=True      # Return attention mask
        )

        # Add the outputs to the lists
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists of per-sentence id lists to 2-D tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks
0 Comments
Leave a Reply. |
Author — Write something about yourself. No need to be fancy, just an overview. | Archives | Categories |