How to handle Multi Label DataSet from Directory for image captioning in PyTorch

I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me

Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.

Here they have used Standard COCO Dataset.

I have dataset as images/ and captions/ directory .

Example

Directory Structure:

images/T001.jpg 

images/T002.jpg 

...

...

captions/T001.txt

captions/T002.txt

....

....

The above is the relation. Caption file has 'n' number of captions in each separate line.

I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.

Any guidance/suggestion on how to achieving this.

++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:

from __future__ import print_function

import torch

from torchvision import datasets, models, transforms

from torchvision import transforms

from torch.autograd import Variable

from torch.nn.utils.rnn import pack_padded_sequence

import torch.optim as optim

import torch.nn as nn

#from torch import np

import numpy as np

import utils_c

from data_loader_c import get_cust_data_loader 

from models import CNN, RNN

from vocab_custom import Vocabulary, load_vocab

import os



class ImageCaptionDataSet(data.Dataset):

    def __init__(self, path, json, vocab=None, transform=None):

        self.vocab = vocab 

        self.transform = transform

        self.img_dir_path = path  

        self.cap_dir_path = json 

        self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))

        self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))

        pass



    def __getitem__(self,index):

        vocab = self.vocab



        img_path = self.all_imgs_path[index]

        img_base_name = os.path.basename(img_path) 

        cap_base_name = img_base_name.replace(".jpg",".txt")

        cap_path  = os.path.join(self.cap_dir_path,cap_base_name)



        caption_all_for_a_image = open(cap_path).read().split("n")



        image = Image.open(img_path)

        image = image.convert('RGB')



        if self.transform != None:

            # apply image preprocessing

            image = self.transform(image)



        #captions_combined = 

        #max_len = 0  

        #for caption in caption_all_for_a_image:

        #    caption_str = str(caption).lower()

        #    tokens = nltk.tokenize.word_tokenize(caption_str)

        #    m = len(tokens) + 2 

        #    if m>max_len:

        #        max_len = m 

        #    caption = torch.Tensor([vocab(vocab.start_token())] +

        #                           [vocab(token) for token in tokens] +

        #                           [vocab(vocab.end_token())])

        #    captions_combined.append(caption) 

        #    #yield image, caption

        #return image,torch.Tensor(captions_combined)



        caption_str = str(caption_all_for_a_image).lower()

        tokens = nltk.tokenize.word_tokenize(caption_str)

        caption = torch.Tensor([vocab(vocab.start_token())] +

                                   [vocab(token) for token in tokens] +

                                   [vocab(vocab.end_token())])



        return image,caption



    def __len__(self):

        return len(self.all_imgs_path)

+++++++++++++++++++++++++++++++++

asked Nov 23 '18 at 7:43

rajeshkumargp

2516

which of the lines do you want? the first? last? a random one?

– Shai
Nov 23 '18 at 9:33

Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.

– rajeshkumargp
Nov 24 '18 at 5:00

add a comment |

I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me

Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.

Here they have used Standard COCO Dataset.

I have dataset as images/ and captions/ directory .

Example

Directory Structure:

images/T001.jpg 

images/T002.jpg 

...

...

captions/T001.txt

captions/T002.txt

....

....

The above is the relation. Caption file has 'n' number of captions in each separate line.

I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.

Any guidance/suggestion on how to achieving this.

++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:

from __future__ import print_function

import torch

from torchvision import datasets, models, transforms

from torchvision import transforms

from torch.autograd import Variable

from torch.nn.utils.rnn import pack_padded_sequence

import torch.optim as optim

import torch.nn as nn

#from torch import np

import numpy as np

import utils_c

from data_loader_c import get_cust_data_loader 

from models import CNN, RNN

from vocab_custom import Vocabulary, load_vocab

import os



class ImageCaptionDataSet(data.Dataset):

    def __init__(self, path, json, vocab=None, transform=None):

        self.vocab = vocab 

        self.transform = transform

        self.img_dir_path = path  

        self.cap_dir_path = json 

        self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))

        self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))

        pass



    def __getitem__(self,index):

        vocab = self.vocab



        img_path = self.all_imgs_path[index]

        img_base_name = os.path.basename(img_path) 

        cap_base_name = img_base_name.replace(".jpg",".txt")

        cap_path  = os.path.join(self.cap_dir_path,cap_base_name)



        caption_all_for_a_image = open(cap_path).read().split("n")



        image = Image.open(img_path)

        image = image.convert('RGB')



        if self.transform != None:

            # apply image preprocessing

            image = self.transform(image)



        #captions_combined = 

        #max_len = 0  

        #for caption in caption_all_for_a_image:

        #    caption_str = str(caption).lower()

        #    tokens = nltk.tokenize.word_tokenize(caption_str)

        #    m = len(tokens) + 2 

        #    if m>max_len:

        #        max_len = m 

        #    caption = torch.Tensor([vocab(vocab.start_token())] +

        #                           [vocab(token) for token in tokens] +

        #                           [vocab(vocab.end_token())])

        #    captions_combined.append(caption) 

        #    #yield image, caption

        #return image,torch.Tensor(captions_combined)



        caption_str = str(caption_all_for_a_image).lower()

        tokens = nltk.tokenize.word_tokenize(caption_str)

        caption = torch.Tensor([vocab(vocab.start_token())] +

                                   [vocab(token) for token in tokens] +

                                   [vocab(vocab.end_token())])



        return image,caption



    def __len__(self):

        return len(self.all_imgs_path)

+++++++++++++++++++++++++++++++++

asked Nov 23 '18 at 7:43

rajeshkumargp

2516

which of the lines do you want? the first? last? a random one?

– Shai
Nov 23 '18 at 9:33

Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.

– rajeshkumargp
Nov 24 '18 at 5:00

add a comment |

I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me

Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.

Here they have used Standard COCO Dataset.

I have dataset as images/ and captions/ directory .

Example

Directory Structure:

images/T001.jpg 

images/T002.jpg 

...

...

captions/T001.txt

captions/T002.txt

....

....

The above is the relation. Caption file has 'n' number of captions in each separate line.

I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.

Any guidance/suggestion on how to achieving this.

++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:

from __future__ import print_function

import torch

from torchvision import datasets, models, transforms

from torchvision import transforms

from torch.autograd import Variable

from torch.nn.utils.rnn import pack_padded_sequence

import torch.optim as optim

import torch.nn as nn

#from torch import np

import numpy as np

import utils_c

from data_loader_c import get_cust_data_loader 

from models import CNN, RNN

from vocab_custom import Vocabulary, load_vocab

import os



class ImageCaptionDataSet(data.Dataset):

    def __init__(self, path, json, vocab=None, transform=None):

        self.vocab = vocab 

        self.transform = transform

        self.img_dir_path = path  

        self.cap_dir_path = json 

        self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))

        self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))

        pass



    def __getitem__(self,index):

        vocab = self.vocab



        img_path = self.all_imgs_path[index]

        img_base_name = os.path.basename(img_path) 

        cap_base_name = img_base_name.replace(".jpg",".txt")

        cap_path  = os.path.join(self.cap_dir_path,cap_base_name)



        caption_all_for_a_image = open(cap_path).read().split("n")



        image = Image.open(img_path)

        image = image.convert('RGB')



        if self.transform != None:

            # apply image preprocessing

            image = self.transform(image)



        #captions_combined = 

        #max_len = 0  

        #for caption in caption_all_for_a_image:

        #    caption_str = str(caption).lower()

        #    tokens = nltk.tokenize.word_tokenize(caption_str)

        #    m = len(tokens) + 2 

        #    if m>max_len:

        #        max_len = m 

        #    caption = torch.Tensor([vocab(vocab.start_token())] +

        #                           [vocab(token) for token in tokens] +

        #                           [vocab(vocab.end_token())])

        #    captions_combined.append(caption) 

        #    #yield image, caption

        #return image,torch.Tensor(captions_combined)



        caption_str = str(caption_all_for_a_image).lower()

        tokens = nltk.tokenize.word_tokenize(caption_str)

        caption = torch.Tensor([vocab(vocab.start_token())] +

                                   [vocab(token) for token in tokens] +

                                   [vocab(vocab.end_token())])



        return image,caption



    def __len__(self):

        return len(self.all_imgs_path)

+++++++++++++++++++++++++++++++++

asked Nov 23 '18 at 7:43

rajeshkumargp

2516

I need a help in PyTorch,
Regarding Dataloader, and dataset
Can someone aid/guide me

Here is my query :
I am trying for Image Captioning using https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/03-advanced/image_captioning.

Here they have used Standard COCO Dataset.

I have dataset as images/ and captions/ directory .

Example

Directory Structure:

images/T001.jpg 

images/T002.jpg 

...

...

captions/T001.txt

captions/T002.txt

....

....

The above is the relation. Caption file has 'n' number of captions in each separate line.

I am able to create a custom Dataset class, in that the complete caption file content is being returned. But I want only one line alone gas to be returned.

Any guidance/suggestion on how to achieving this.

++++++++++++++++++++++++++++++++++++++++++++++++
Here is the class that i have designed:

from __future__ import print_function

import torch

from torchvision import datasets, models, transforms

from torchvision import transforms

from torch.autograd import Variable

from torch.nn.utils.rnn import pack_padded_sequence

import torch.optim as optim

import torch.nn as nn

#from torch import np

import numpy as np

import utils_c

from data_loader_c import get_cust_data_loader 

from models import CNN, RNN

from vocab_custom import Vocabulary, load_vocab

import os



class ImageCaptionDataSet(data.Dataset):

    def __init__(self, path, json, vocab=None, transform=None):

        self.vocab = vocab 

        self.transform = transform

        self.img_dir_path = path  

        self.cap_dir_path = json 

        self.all_imgs_path = glob.glob(os.path.join(self.img_dir_path,'*.jpg'))

        self.all_caps_path = glob.glob(os.path.join(self.cap_dir_path,'*.txt'))

        pass



    def __getitem__(self,index):

        vocab = self.vocab



        img_path = self.all_imgs_path[index]

        img_base_name = os.path.basename(img_path) 

        cap_base_name = img_base_name.replace(".jpg",".txt")

        cap_path  = os.path.join(self.cap_dir_path,cap_base_name)



        caption_all_for_a_image = open(cap_path).read().split("n")



        image = Image.open(img_path)

        image = image.convert('RGB')



        if self.transform != None:

            # apply image preprocessing

            image = self.transform(image)



        #captions_combined = 

        #max_len = 0  

        #for caption in caption_all_for_a_image:

        #    caption_str = str(caption).lower()

        #    tokens = nltk.tokenize.word_tokenize(caption_str)

        #    m = len(tokens) + 2 

        #    if m>max_len:

        #        max_len = m 

        #    caption = torch.Tensor([vocab(vocab.start_token())] +

        #                           [vocab(token) for token in tokens] +

        #                           [vocab(vocab.end_token())])

        #    captions_combined.append(caption) 

        #    #yield image, caption

        #return image,torch.Tensor(captions_combined)



        caption_str = str(caption_all_for_a_image).lower()

        tokens = nltk.tokenize.word_tokenize(caption_str)

        caption = torch.Tensor([vocab(vocab.start_token())] +

                                   [vocab(token) for token in tokens] +

                                   [vocab(vocab.end_token())])



        return image,caption



    def __len__(self):

        return len(self.all_imgs_path)

+++++++++++++++++++++++++++++++++

python pytorch

asked Nov 23 '18 at 7:43

rajeshkumargp

2516

asked Nov 23 '18 at 7:43

rajeshkumargp

2516

asked Nov 23 '18 at 7:43

rajeshkumargp

2516

asked Nov 23 '18 at 7:43

rajeshkumargp

2516

asked Nov 23 '18 at 7:43

rajeshkumargp

2516

which of the lines do you want? the first? last? a random one?

– Shai
Nov 23 '18 at 9:33

Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.

– rajeshkumargp
Nov 24 '18 at 5:00

add a comment |

which of the lines do you want? the first? last? a random one?

– Shai
Nov 23 '18 at 9:33

Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.

– rajeshkumargp
Nov 24 '18 at 5:00

which of the lines do you want? the first? last? a random one?

– Shai
Nov 23 '18 at 9:33

Assume Image001 has 5 captions ie. 5 lines of text. I want 5 times the return has to be executed. ie. Image001 - line 1 Image002 - line 2 like that.

– rajeshkumargp
Nov 24 '18 at 5:00

add a comment |

1 Answer
1

active

oldest

votes

First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:

cap = ['a sentence', 'bla bla bla']

str(cap)

Returns this sting:

"['a sentence', 'bla bla bla']"

Note that [', and ', ' are part of the resulting string!

You can pick one of the captions at random:

import random

...

cap_idx = random.randi(0, len(caption_all_for_a_image)-1)  # pick one at random

caption_str = caption_all_for_a_image[cap_idx].lower()  # actual selection

answered Nov 23 '18 at 9:51

Shai

70.7k23138247

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53442510%2fhow-to-handle-multi-label-dataset-from-directory-for-image-captioning-in-pytorch%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:

cap = ['a sentence', 'bla bla bla']

str(cap)

Returns this sting:

"['a sentence', 'bla bla bla']"

Note that [', and ', ' are part of the resulting string!

You can pick one of the captions at random:

import random

...

cap_idx = random.randi(0, len(caption_all_for_a_image)-1)  # pick one at random

caption_str = caption_all_for_a_image[cap_idx].lower()  # actual selection

answered Nov 23 '18 at 9:51

Shai

70.7k23138247

add a comment |

First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:

cap = ['a sentence', 'bla bla bla']

str(cap)

Returns this sting:

"['a sentence', 'bla bla bla']"

Note that [', and ', ' are part of the resulting string!

You can pick one of the captions at random:

import random

...

cap_idx = random.randi(0, len(caption_all_for_a_image)-1)  # pick one at random

caption_str = caption_all_for_a_image[cap_idx].lower()  # actual selection

answered Nov 23 '18 at 9:51

Shai

70.7k23138247

add a comment |

First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:

cap = ['a sentence', 'bla bla bla']

str(cap)

Returns this sting:

"['a sentence', 'bla bla bla']"

Note that [', and ', ' are part of the resulting string!

You can pick one of the captions at random:

import random

...

cap_idx = random.randi(0, len(caption_all_for_a_image)-1)  # pick one at random

caption_str = caption_all_for_a_image[cap_idx].lower()  # actual selection

answered Nov 23 '18 at 9:51

Shai

70.7k23138247

First, using str() to convert the list of captions into a single string (caption_str = str(caption_all_for_a_image)) is a bad idea:

cap = ['a sentence', 'bla bla bla']

str(cap)

Returns this sting:

"['a sentence', 'bla bla bla']"

Note that [', and ', ' are part of the resulting string!

You can pick one of the captions at random:

import random

...

cap_idx = random.randi(0, len(caption_all_for_a_image)-1)  # pick one at random

caption_str = caption_all_for_a_image[cap_idx].lower()  # actual selection

answered Nov 23 '18 at 9:51

Shai

70.7k23138247

answered Nov 23 '18 at 9:51

Shai

70.7k23138247

answered Nov 23 '18 at 9:51

Shai

70.7k23138247

answered Nov 23 '18 at 9:51

Shai

70.7k23138247

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Wsrtjtyk