/* The website is built from the list of pages/tasks in src/tasks.ts
 * This list is used to build the menu and defines what components and properties to use for each
 * task.
 *
 * Tasks define all the info needed to render and run the models of the task.  For example, a task
 * in tasks.ts defines the set of example images that will work best for that task and the set of
 * models that can be used to solve that task.
 *
 * New models for existing tasks are very easy to add.  To add a new model, simply find the right
 * task, copy an existing model in that task, and edit the fields as needed.
 * If the model you want to add does not fit any existing task, you will likely need to make a
 * new task and a new API model.
 *
 * The API consists of a simple fetch in api/api.ts and a number of Answer interfaces.
 *
 * Answer is a simple interface that all API answers implement.
 * eg, CaptioningAnswer is an Answer
 *
 * Individual Answers (like CaptioningAnswer) have many forms and require unique downstream code to
 * utilize them.
 * Many tasks may use the same Answer.  But they must all return the same Answer interface/form.
 *
 * Models in tasks.ts define the AnswerInfo that is used to display the results.
 * AnswerInfo has an AnswerInfoProps with a generic type of Answer
 * eg, CaptioningAnswerInfo is an AnswerInfo with AnswerInfoProps<CaptioningAnswer>
 *
 * AnswerInfo holds the question and the answer
 * CaptioningAnswerInfo is responsible for rendering the answer found in the AnswerInfo
 *
 * AnswerPage collects an image and other parameters from the user and passes them to a
 * component via an AnswerInfo.
 *
 * AnswerPage is of generic type <A(nswer),P(rops)> where Answer holds props of type A and
 * ParamsControls holds props of the type P.
 *
 * The page used for captioning answers is an AnswerPage<CaptioningAnswer, BaseImageModelParams>
 * with props = Props<CaptioningAnswer, BaseImageModelParams>.
 * This allows the props to contain the appropriate ParamsControl and AnswerInfo.
 * AnswerPage<A,P> has Props<A,P>, which means it holds AnswerInfo<A> and ModelProps<P>.
 *
 * AnswerPage specifies additional api parameters via ModelParams passed to the ParamsControl
 * passed to it.  eg, VisualQuestionAnswer passes VisualQuestionModelParamControl (which has props
 * of VisualQuestionParams) to the AnswerPage.  This allows additional parameters to be
 * collected on a per task level.
 */

import * as React from 'react';
import { Popover, Tag } from '@allenai/varnish';

import {
    CaptioningAnswerInfo,
    ClassificationAnswerInfo,
    DepthEstimationAnswerInfo,
    GroundedSituationRecognitionAnswerInfo,
    ImageGenerationAnswerInfo,
    ImageGenerationModelParamControl,
    ImageGenerationParams,
    ImageModelParams,
    ImageParamControl,
    ObjectDetectionAnswerInfo,
    PoseEstimationAnswerInfo,
    SegmentationAnswerInfo,
    SituationRecognitionAnswerInfo,
    SurfaceNormalEstimationAnswerInfo,
    VisualQuestionAnswerInfo,
    VisualQuestionBboxModelParamControl,
    VisualQuestionModelParamControl,
    VisualQuestionParams,
    ModelParamLabel,
    VisualQuestionBboxParams,
    allImageCrop,
} from './components';
import { AnswerPage, AnswerPageProps } from './pages';
import {
    BaseQuery,
    baseRequestFrom,
    CaptioningAnswer,
    ClassificationAnswer,
    DepthEstimationAnswer,
    GroundedSituationRecognitionAnswer,
    ImageGenerationAnswer,
    ObjectDetectionAnswer,
    PoseEstimationAnswer,
    SegmentationAnswer,
    SituationRecognitionAnswer,
    SurfaceNormalEstimationAnswer,
    VisualQuestionAnswer,
} from './api';
import { AppRoute, AppRouteGroup } from './AppRoute';
import { Dictionary } from './utils';

// menu icons
import poseIcon from './icons/pose-14px.svg';
import surfaceNormalsIcon from './icons/normals-14px.svg';
import visionIcon from './icons/vision-14px.svg';
import multiTaskIcon from './icons/multitask.svg';
import recognitionIcon from './icons/recognition-14px.svg';

// example images
import baseballGameSrc from './examples/baseball_game.jpg';
import baseballPitchingMotionSrc from './examples/baseball_pitching_motion.jpg';
import birdSingingSrc from './examples/bird_singing.jpg';
import birdsSrc from './examples/birds.jpg';
import buildingsSrc from './examples/buildings.jpg';
import busStopSrc from './examples/bus_stop.jpg';
import campingSrc from './examples/camping.jpg';
import city from './examples/city.png';
import cityStreetSrc from './examples/city_street.jpg';
import cookingSrc from './examples/cooking.jpg';
import desertRoadSrc from './examples/desert_road.jpg';
import diningRoomSrc from './examples/dining_room.jpg';
import doctorSrc from './examples/doctor.jpg';
import dragonBallZCrowdSrc from './examples/dragon_ball_z_crowd.jpg';
import drawingSrc from './examples/drawing.jpg';
import exerciseSrc from './examples/exercise.jpg';
import hallwaySrc from './examples/hallway.jpg';
import humanWildlifeConflictSrc from './examples/human_wildlife_conflict.jpg';
import islandRoadSrc from './examples/island_road.jpg';
import kidsDrawingSrc from './examples/kids_drawing.jpg';
import kidsSoccerSrc from './examples/kids_soccer.jpg';
import kitchenSrc from './examples/kitchen.jpg';
import landscapeSrc from './examples/landscape.jpg';
import lawnSrc from './examples/lawn.jpg';
import livingRoomSrc from './examples/living_room.jpg';
import londonStreetSrc from './examples/london_street.jpg';
import mountainSrc from './examples/mountain.jpg';
import octopusSrc from './examples/octopus.jpg';
import pagudpudBeachSrc from './examples/pagudpud_beach.jpg';
import skydiverSrc from './examples/skydiver.jpg';
import soccerGameSrc from './examples/soccer_game.jpg';
import studentsSrc from './examples/students.jpg';
import summerStreetSrc from './examples/summer_street.jpg';
import swiftTrawlerSrc from './examples/swift_trawler.jpg';
import teacherSrc from './examples/teacher.jpg';
import treeHouseSrc from './examples/tree_house.jpg';
import yellowLabradorSrc from './examples/yellow_labrador.jpg';
import peripicalStraumannBl from './examples/periapical_straumann_bl.jpg';
import peripicalNobelReplace from './examples/periapical_nobel_replace.jpg';
import peripicalStraumannWn from './examples/periapical_straumann_wn.jpg';

import sari from './examples/sari.jpg';
import sewingMachine from './examples/sewing_machine.jpg';
import jaguarYawning from './examples/jaguar_yawning.jpg';
import stuffedMonkey from './examples/stuffed_monkey.jpg';
import beach from './examples/beach.jpg';

import {
    GeneralPurposeLanguageAnswerInfo,
    GPVProps,
} from './components/GeneralPurposeLanguageAnswerInfo';

/**
 * A labelled hyperlink: a piece of human-readable text paired with the address it refers to.
 * Used in this file for paper links (label is the paper title) and repository links
 * (label is e.g. 'github').
 */
export interface LinkInfo {
    /** Address the link points to. */
    url: string;
    /** Human-readable text associated with the link. */
    label: string;
}

/**
 * Citation details attached to a model entry: which paper it comes from, who wrote it, and where
 * and when it was published, plus an optional link to a code repository.
 */
export interface Attribution {
    /** Link to the paper; in this file the label holds the paper title. */
    paper: LinkInfo;
    /** Optional link to a source repository for the model. */
    repo?: LinkInfo;
    /** Author names, one entry per author. */
    authors: string[];
    /** Venue name, e.g. 'CVPR' or 'arXiv'. */
    publication: string;
    /** Year of publication. */
    publicationYear: number;
}

export const tasks: Dictionary<AppRoute> = {
    // Image classification page, routed at '/classification' and rendered with AnswerPage.
    // It uses ImageParamControl as its parameter control and offers three models; every model
    // displays its result through ClassificationAnswerInfo and differs only in its description,
    // attribution and API endpoint.
    classification: {
        path: '/classification',
        label: 'Classification',
        component: AnswerPage,
        componentProps: {
            title: 'Classification',
            markdownDescription: `
Image classification is the task of assigning an input image, a single label drawn from a fixed set
of categories. Image classification models are trained and evaluated on large classification
datasets such as ImageNet that has 1000 image categories.`,
            // Example images offered on the page.
            examples: [
                { label: 'Yellow Labrador', img1Src: yellowLabradorSrc },
                { label: 'Bird Singing', img1Src: birdSingingSrc },
            ],
            modelParamComponent: ImageParamControl,
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'ResNet50',
                    markdownDescription: `
A very popular model that is often used as a backbone CNN to extract visual representations. It
achieves a Top 1 accuracy of 76.1 on ImageNet (1000 categories).`,
                    attribution: {
                        paper: {
                            label: 'Deep Residual Learning for Image Recognition',
                            url: 'https://www.semanticscholar.org/paper/061356704ec86334dbbc073985375fe13cd39088',
                        },
                        // The ResNet paper was published at CVPR 2016.
                        publication: 'CVPR',
                        publicationYear: 2016,
                        authors: ['Kaiming He', 'Xiangyu Zhang', 'Shaoqing Ren', 'Jian Sun'],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/pytorch/vision',
                        },
                    },
                    answerInfo: ClassificationAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/resnet50`),
                    estimatedApiDuration: 2100,
                },
                {
                    name: 'ResNeXt-101-32x8d',
                    markdownDescription: `
Achieves a very impressive Top 1 accuracy of 79.3 on ImageNet (1000 categories).`,
                    attribution: {
                        paper: {
                            label: 'Aggregated Residual Transformations for Deep Neural Networks',
                            url: 'https://www.semanticscholar.org/paper/f6e0856b4a9199fa968ac00da612a9407b5cb85c',
                        },
                        publication: 'CVPR',
                        publicationYear: 2017,
                        authors: [
                            'Saining Xie',
                            'Ross B. Girshick',
                            'Piotr Dollár',
                            'Zhuowen Tu',
                            'Kaiming He',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/pytorch/vision',
                        },
                    },
                    answerInfo: ClassificationAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/resnext101`),
                    estimatedApiDuration: 2100,
                },
                {
                    name: 'MobileNet V2',
                    markdownDescription: `
A lean mobile network that achieves a Top 1 accuracy of 72.0 on ImageNet (1000 categories) with just
3.4 Million parameters`,
                    attribution: {
                        paper: {
                            label: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks',
                            url: 'https://www.semanticscholar.org/paper/dd9cfe7124c734f5a6fc90227d541d3dbcd72ba4',
                        },
                        publication: 'CVPR',
                        publicationYear: 2018,
                        authors: [
                            'Mark Sandler',
                            'Andrew Howard',
                            'Menglong Zhu',
                            'Andrey Zhmoginov',
                            'Liang-Chieh Chen',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/pytorch/vision',
                        },
                    },
                    answerInfo: ClassificationAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/mobilenet-v2`),
                    estimatedApiDuration: 2100,
                },
            ],
        } as AnswerPageProps<ClassificationAnswer, ImageModelParams>,
    },

    // Object detection page, routed at '/detection' and rendered with AnswerPage.
    // It uses ImageParamControl as its parameter control and offers two models (YoloV3 and
    // Faster R-CNN), both of which display their results through ObjectDetectionAnswerInfo.
    objectDetection: {
        path: '/detection',
        label: 'Detection',
        component: AnswerPage,
        componentProps: {
            title: 'Detection',
            markdownDescription: `
Object detection is the task of identifying and locating objects in an image. Object detection
models are typically trained and evaluated on the MS-COCO dataset that has 80 object categories.`,
            // Example images offered on the page.
            examples: [
                { label: 'Baseball Game', img1Src: baseballGameSrc },
                { label: 'Bus Stop', img1Src: busStopSrc },
                { label: 'Kitchen', img1Src: kitchenSrc },
                { label: 'Living Room', img1Src: livingRoomSrc },
            ],
            modelParamComponent: ImageParamControl,
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'YoloV3',
                    markdownDescription: `
As the name “You Only Look Once” suggests, this is a standard single shot model. Only examining the
features of the image once allows this model to detect objects incredibly quickly. While this comes
at a small cost to accuracy, Yolo is nearly comparable in terms of accuracy to the larger two stage
detectors.`,
                    attribution: {
                        paper: {
                            label: 'YOLOv3: An Incremental Improvement',
                            url: 'https://www.semanticscholar.org/paper/e4845fb1e624965d4f036d7fd32e8dcdd2408148',
                        },
                        publication: 'arXiv',
                        publicationYear: 2018,
                        authors: ['Joseph Redmon', 'Ali Farhadi'],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/ultralytics/yolov3',
                        },
                    },
                    answerInfo: ObjectDetectionAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/yolov3`),
                    estimatedApiDuration: 2100,
                },
                {
                    name: 'Faster R-CNN',
                    markdownDescription: `
This model is an example of a two stage detector, meaning it first selects regions which are likely
to contain objects and then locates the objects within these regions. However, unlike some of its
predecessors, Faster R-CNN is able to maintain a reasonable speed by sharing image features between
its two stages. While much slower than single shot detector like yolo, it has a slightly higher
accuracy.`,
                    attribution: {
                        paper: {
                            label: 'Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks',
                            url: 'https://www.semanticscholar.org/paper/424561d8585ff8ebce7d5d07de8dbf7aae5e7270',
                        },
                        publication: 'NeurIPS',
                        publicationYear: 2015,
                        authors: ['Shaoqing Ren', 'Kaiming He', 'Ross B. Girshick', 'Jian Sun'],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/pytorch/vision',
                        },
                    },
                    answerInfo: ObjectDetectionAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/fasterrcnn`),
                    estimatedApiDuration: 2100,
                },
            ],
        } as AnswerPageProps<ObjectDetectionAnswer, ImageModelParams>,
    },

    // Image segmentation page, routed at '/segmentation' and rendered with AnswerPage.
    // It uses ImageParamControl as its parameter control and offers a single model (DeepLabV3)
    // whose results are displayed through SegmentationAnswerInfo.
    segmentation: {
        path: '/segmentation',
        label: 'Segmentation',
        component: AnswerPage,
        componentProps: {
            title: 'Segmentation',
            markdownDescription: `
Image segmentation is the task of segmenting the pixels in an image into a set of object categories.
Image segmentation models are typically trained and evaluated on the PASCAL VOC segmentation
dataset.`,
            // Example images offered on the page.
            examples: [
                { label: 'Swift Trawler Boat', img1Src: swiftTrawlerSrc },
                { label: 'Soccer Game', img1Src: soccerGameSrc },
                { label: 'Summer Street', img1Src: summerStreetSrc },
            ],
            modelParamComponent: ImageParamControl,
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'DeepLabV3',
                    markdownDescription: `
This network employs atrous convolution filters with multiple atrous rates to capture multi-scale
context and obtains a mean Intersection over Union (mIOU) of 85.7% on the PASCAL VOC 2012 dataset.`,
                    attribution: {
                        paper: {
                            label: 'Rethinking Atrous Convolution for Semantic Image Segmentation',
                            url: 'https://www.semanticscholar.org/paper/ee4a012a4b12d11d7ab8c0e79c61e807927a163c',
                        },
                        // This paper (DeepLabv3) is a 2017 arXiv preprint; the ECCV 2018 paper by
                        // an overlapping author list is the later DeepLabv3+ work.
                        publication: 'arXiv',
                        publicationYear: 2017,
                        authors: [
                            'Liang-Chieh Chen',
                            'George Papandreou',
                            'Florian Schroff',
                            'Hartwig Adam',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/pytorch/vision',
                        },
                    },
                    answerInfo: SegmentationAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/deeplabv3`),
                    estimatedApiDuration: 19000,
                },
            ],
        } as AnswerPageProps<SegmentationAnswer, ImageModelParams>,
    },

    // Text-to-image generation page, routed at '/text_to_image_generation' and rendered with
    // AnswerPage. Unlike the image-based tasks, its examples carry a `caption` instead of an
    // image, and it uses ImageGenerationModelParamControl as its parameter control. The single
    // model (X-LXMERT) displays its results through ImageGenerationAnswerInfo.
    imageGeneration: {
        path: '/text_to_image_generation',
        label: 'Image Generation',
        component: AnswerPage,
        componentProps: {
            title: 'Text-to-Image Generation',
            markdownDescription: `
Text-to-Image Generation is the task of generating an image conditioned on the input text.`,
            stepOneInstruction: 'Enter a Caption (or choose one from the examples)',
            // Example captions offered on the page.
            examples: [
                {
                    label: 'Giraffe by Tree',
                    caption: 'A giraffe standing on dirt ground near a tree.',
                },
                {
                    label: 'Giraffe by Cars',
                    caption: 'A giraffe walking on a road with two cars approaching.',
                },
                {
                    label: 'People on Couch',
                    caption: 'Two people play video games while sitting on a couch.',
                },
                {
                    label: 'Kites and Trees',
                    caption: 'A grassy tree filled field with a lot of kites in the air.',
                },
                {
                    label: 'Clock Tower',
                    caption: 'A large painted clock tower in the middle of town.',
                },
                {
                    label: 'Skier on Flat Hill',
                    caption: 'A woman attempting to ski on a flat hill',
                },
                {
                    label: 'Home Office',
                    caption: 'A full view of a home office with many computer screens.',
                },
            ],
            modelParamComponent: ImageGenerationModelParamControl,
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'X-LXMERT',
                    markdownDescription: `
An extension to LXMERT with training refinements including: discretizing visual representations,
using uniform masking with a large range of masking ratios and aligning the right pre-training
datasets to the right objectives which enables it to paint.`,
                    attribution: {
                        paper: {
                            label: 'X-LXMERT: Paint, Caption and Answer Questions with Multi-Modal Transformers',
                            url: 'https://www.semanticscholar.org/paper/e5fb7a72807af36a7d39049346b3feb422a50c3c',
                        },
                        publication: 'EMNLP',
                        publicationYear: 2020,
                        authors: [
                            'Jaemin Cho',
                            'Jiasen Lu',
                            'Dustin Schwenk',
                            'Hannaneh Hajishirzi',
                            'Aniruddha Kembhavi',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/allenai/x-lxmert',
                        },
                    },
                    answerInfo: ImageGenerationAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/xlxmert`),
                    estimatedApiDuration: 10000, // TODO: reeval this number after model launch
                },
            ],
        } as AnswerPageProps<ImageGenerationAnswer, ImageGenerationParams>,
    },

    // Visual question answering page, routed at '/visual_question' and rendered with AnswerPage.
    // Each example pairs an image with a `question`, and the page uses
    // VisualQuestionModelParamControl as its parameter control. The single model (Pythia)
    // displays its results through VisualQuestionAnswerInfo.
    visualQuestion: {
        path: '/visual_question',
        label: 'Visual QA',
        component: AnswerPage,
        componentProps: {
            title: 'Visual Question Answering',
            markdownDescription: `
Visual Question Answering (VQA) is the task of generating an answer in response to a natural language
question about the contents of an image. VQA models are typically trained and evaluated on datasets
such as VQA2.0, GQA, Visual7W and VizWiz.`,
            stepOneInstruction:
                'Upload an Image and enter a Question (or choose from the examples)',
            // Example image/question pairs offered on the page.
            examples: [
                {
                    label: 'Baseball Game',
                    img1Src: baseballGameSrc,
                    question: 'What game are they playing?',
                },
                {
                    label: 'Bus Stop',
                    img1Src: busStopSrc,
                    question: 'What are the people waiting for?',
                },
                {
                    label: 'Kitchen',
                    img1Src: kitchenSrc,
                    question: 'What is in the bowls on the island?',
                },
                {
                    label: 'Living Room',
                    img1Src: livingRoomSrc,
                    question: 'What color is the pillow in the middle?',
                },
            ],
            modelParamComponent: VisualQuestionModelParamControl,
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'Pythia',
                    markdownDescription: `
This is a modular re-implementation of the bottom-up top-down (up-down) model (Anderson et al) with
subtle but important changes to the model architecture and the learning rate schedule, finetuning
image features, and adding data augmentation. This model was the winning entry to the VQA Challenge
in 2018.`,
                    attribution: {
                        paper: {
                            label: 'Pythia v0.1: the Winning Entry to the VQA Challenge 2018',
                            url: 'https://www.semanticscholar.org/paper/Pythia-v0.1%3A-the-Winning-Entry-to-the-VQA-Challenge-Jiang-Natarajan/36c3972569a6949ecca90bfa6f8e99883e092845',
                        },
                        publication: 'arXiv',
                        publicationYear: 2018,
                        authors: [
                            'Yu Jiang',
                            'Vivek Natarajan',
                            'Xinlei Chen',
                            'Marcus Rohrbach',
                            'Dhruv Batra',
                            'Devi Parikh',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/facebookresearch/pythia',
                        },
                    },
                    answerInfo: VisualQuestionAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/pythia`),
                    estimatedApiDuration: 1900,
                },
            ],
        } as AnswerPageProps<VisualQuestionAnswer, VisualQuestionParams>,
    },

    // Situation recognition page, routed at '/situation_recognition' and rendered with
    // AnswerPage. It uses ImageParamControl as its parameter control and offers a single model
    // (imSitu) whose results are displayed through SituationRecognitionAnswerInfo.
    situationRecognition: {
        path: '/situation_recognition',
        label: 'Situation Recognition',
        component: AnswerPage,
        componentProps: {
            title: 'Situation Recognition',
            markdownDescription: `
This task is to produce concise summaries of situations in images depicting: (1) the main activity,
(2) the participating actors, objects, substances, and locations and most importantly (3) the roles
these participants play in the activity.`,
            // Example images offered on the page (the same list as groundedSituationRecognition).
            examples: [
                { label: 'Cooking at Stove', img1Src: cookingSrc },
                { label: 'Doctor with Patient', img1Src: doctorSrc },
                { label: 'Drawing a Flower', img1Src: drawingSrc },
                { label: 'Teacher', img1Src: teacherSrc },
                { label: 'Mowing the Lawn', img1Src: lawnSrc },
                { label: 'Octopus', img1Src: octopusSrc },
                { label: 'Kids Drawing', img1Src: kidsDrawingSrc },
                { label: 'Camping Site', img1Src: campingSrc },
                { label: 'Skydiver', img1Src: skydiverSrc },
                { label: 'Landscape', img1Src: landscapeSrc },
            ],
            modelParamComponent: ImageParamControl,
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'imSitu',
                    markdownDescription: `
This model uses a conditional random field acting on potentials calculated from a neural net. These
potentials contain information about the nouns or potential objects in the image, the role these
objects have in the scene and a representation of the entire image. The representation for the roles
are specific for the image while the representations for the nouns are global. The action for an
image is calculated directly from the image representation.`,
                    attribution: {
                        paper: {
                            label: 'Commonly Uncommon: Semantic Sparsity in Situation Recognition',
                            url: 'https://www.semanticscholar.org/paper/02239ae5e922075a354169f75f684cad8fdfd5ab',
                        },
                        publication: 'CVPR',
                        publicationYear: 2017,
                        authors: [
                            'Mark Yatskar',
                            'Vicente Ordonez',
                            'Luke S. Zettlemoyer',
                            'Ali Farhadi',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/my89/imSitu',
                        },
                    },
                    answerInfo: SituationRecognitionAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/imsitu`),
                    estimatedApiDuration: 1900,
                },
            ],
        } as AnswerPageProps<SituationRecognitionAnswer, ImageModelParams>,
    },

    // Grounded situation recognition page, routed at '/grounded_situation_recognition' and
    // rendered with AnswerPage. The menu label is abbreviated ('Grounded Situ Recog') while the
    // page title is spelled out in full. It uses ImageParamControl as its parameter control and
    // offers a single model (Joint Situation Localizer) whose results are displayed through
    // GroundedSituationRecognitionAnswerInfo.
    groundedSituationRecognition: {
        path: '/grounded_situation_recognition',
        label: 'Grounded Situ Recog',
        component: AnswerPage,
        componentProps: {
            title: 'Grounded Situation Recognition',
            markdownDescription: `
Grounded Situation Recognition is the task of identifying the situation observed in the image and
also visually ground the identified roles within the corresponding image.`,
            // Example images offered on the page (the same list as situationRecognition).
            examples: [
                { label: 'Cooking at Stove', img1Src: cookingSrc },
                { label: 'Doctor with Patient', img1Src: doctorSrc },
                { label: 'Drawing a Flower', img1Src: drawingSrc },
                { label: 'Teacher', img1Src: teacherSrc },
                { label: 'Mowing the Lawn', img1Src: lawnSrc },
                { label: 'Octopus', img1Src: octopusSrc },
                { label: 'Kids Drawing', img1Src: kidsDrawingSrc },
                { label: 'Camping Site', img1Src: campingSrc },
                { label: 'Skydiver', img1Src: skydiverSrc },
                { label: 'Landscape', img1Src: landscapeSrc },
            ],
            modelParamComponent: ImageParamControl,
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'Joint Situation Localizer',
                    markdownDescription: `
JSL is a method to simultaneously classify a situation and locate objects in that situation. This
allows for a role’s noun and grounding to be conditioned on the nouns and groundings of previous
roles and the verb. It also allows features to be shared and potential patterns between nouns and
positions to be exploited.`,
                    attribution: {
                        paper: {
                            label: 'Grounded Situation Recognition',
                            url: 'https://www.semanticscholar.org/paper/fc261c0efb5f9ce82581932d1440630b861fb85f',
                        },
                        publication: 'ECCV',
                        publicationYear: 2020,
                        authors: [
                            'Sarah Pratt',
                            'Mark Yatskar',
                            'Luca Weihs',
                            'Ali Farhadi',
                            'Aniruddha Kembhavi',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/allenai/swig',
                        },
                    },
                    answerInfo: GroundedSituationRecognitionAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/jointsituationlocalizer`),
                    estimatedApiDuration: 2400,
                },
            ],
        } as AnswerPageProps<GroundedSituationRecognitionAnswer, ImageModelParams>,
    },

    // Image captioning page, routed at '/image_captioning' and rendered with AnswerPage.
    // It uses ImageParamControl as its parameter control and offers a single model (BUTD
    // attention) whose results are displayed through CaptioningAnswerInfo.
    captioning: {
        path: '/image_captioning',
        label: 'Image Captioning',
        component: AnswerPage,
        componentProps: {
            title: 'Image Captioning',
            markdownDescription: `
This task is to generate textual description of a digital image. Many models are based on
sequence-to-sequence framework (CNN image encoder + RNN language model) plus attention mechanism.`,
            // Example images offered on the page.
            examples: [
                { label: 'Birds', img1Src: birdsSrc },
                {
                    label: 'Human Wildlife Conflict',
                    img1Src: humanWildlifeConflictSrc,
                },
                { label: 'Mountain Vista', img1Src: mountainSrc },
                { label: 'Pagudpud Beach', img1Src: pagudpudBeachSrc },
                { label: 'People Exercising', img1Src: exerciseSrc },
            ],
            modelParamComponent: ImageParamControl,
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'Bottom-Up & Top-Down (BUTD) Attention',
                    markdownDescription: `
Previous captioning models usually adopt only top-down attention to the sequence-to-sequence
framework. This model combines top-down and bottom-up design into attention mechanism: An object
detector (Faster R-CNN) proposes image regions (bottom-up), and a top-down attention module
determines feature weightings of the proposed regions. This model won the 2017 VQA challenge.`,
                    attribution: {
                        paper: {
                            label: 'Bottom-Up and Top-Down Attention for Image Captioning and VQA',
                            url: 'https://www.semanticscholar.org/paper/a79b694bd4ef51207787da1948ed473903b751ef',
                        },
                        publication: 'CVPR',
                        publicationYear: 2018,
                        authors: [
                            'Peter Anderson',
                            'Xiaodong He',
                            'Chris Buehler',
                            'Damien Teney',
                            'Mark Johnson',
                            'Stephen Gould',
                            'Lei Zhang',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/peteanderson80/bottom-up-attention',
                        },
                    },
                    answerInfo: CaptioningAnswerInfo,
                    apiCall: (q: BaseQuery) => baseRequestFrom(q, `api/butd`),
                    estimatedApiDuration: 3000,
                },
            ],
        } as AnswerPageProps<CaptioningAnswer, ImageModelParams>,
    },
    // Periapical classification page: single-image input, answer shown by ClassificationAnswerInfo.
    periapical: {
        path: '/periapical',
        label: 'Periapical Classification',
        // NOTE(review): presumably keeps this route out of the navigation menu — confirm in the
        // menu-building code.
        hideFromMenu: true,
        component: AnswerPage,
        componentProps: {
            title: 'Periapical Classification',
            markdownDescription: '',
            // Sample images offered on this page.
            examples: [
                { label: 'Straumann BL', img1Src: peripicalStraumannBl },
                { label: 'Nobel Replace', img1Src: peripicalNobelReplace },
                { label: 'Straumann WN', img1Src: peripicalStraumannWn },
            ],
            modelParamComponent: ImageParamControl,
            // Every input id maps to null, i.e. no custom label is supplied for this page.
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'Periapical Model',
                    markdownDescription: '',
                    answerInfo: ClassificationAnswerInfo,
                    // Forwards the user's query to the periapical endpoint.
                    apiCall: (query: BaseQuery) => baseRequestFrom(query, 'api/periapical'),
                    // NOTE(review): presumably milliseconds — confirm against the consumer.
                    estimatedApiDuration: 2100,
                },
            ],
        } as AnswerPageProps<ClassificationAnswer, ImageModelParams>,
    },
    // General Purpose Vision page: the user supplies an image, a free-text task description and an
    // optional bounding box; results are rendered by GeneralPurposeLanguageAnswerInfo.
    generalPurposeLanguage: {
        path: '/general_purpose_vision',
        label: 'General Purpose Vision',
        // hideFromMenu: true,
        component: AnswerPage,
        componentProps: {
            title: 'General Purpose Vision',
            markdownDescription: `
A general purpose vision system can solve many different vision and vision+language tasks with a
 unified architecture. Tasks are performed by training the model to respond to an image and a natural
 language task description.
 For example, the question "What is this?" prompts the model to perform image classification.
 Some systems can also use a bounding box as input and perform additional tasks,
 such as classifying a particular region of the image.
 General purpose vision systems can learn new tasks quickly and transfer knowledge between tasks. 
 `,
            stepOneInstruction: `
Upload an Image and enter a Task Description, or choose from the examples. Optionally
 adjust the bounding box (works best for classification).`,
            // Each example pairs an image with a task description; the last two also preset a
            // bounding box (in percent units) and mark it as changed.
            examples: [
                {
                    label: 'Jaguar (Captioning)',
                    img1Src: jaguarYawning,
                    question: 'Caption this image.',
                    cropChanged: false,
                },
                {
                    label: 'City (Captioning)',
                    img1Src: city,
                    question: 'Caption this image.',
                    cropChanged: false,
                },
                {
                    label: 'Stuffed Toy (Question Answering)',
                    img1Src: stuffedMonkey,
                    question: 'What is this stuffed toy?',
                    crop: allImageCrop,
                    cropChanged: false,
                },
                {
                    label: 'Sari (Question Answering)',
                    img1Src: sari,
                    question: 'What is the woman wearing?',
                    cropChanged: false,
                },
                {
                    label: 'Sewing Machine (Classification)',
                    img1Src: sewingMachine,
                    question: 'What is this?',
                    cropChanged: false,
                },
                {
                    label: 'Island by beach (Classification)',
                    img1Src: beach,
                    question: 'What is this?',
                    cropChanged: true,
                    crop: {
                        unit: '%',
                        x: 0,
                        y: 43.33898443924753,
                        width: 35.891898473103836,
                        height: 12.253185071443257,
                    },
                },
                {
                    label: 'Horse on beach (Classification)',
                    img1Src: beach,
                    question: 'What is this?',
                    cropChanged: true,
                    crop: {
                        unit: '%',
                        x: 9.697670406765408,
                        y: 58.071602269222865,
                        width: 26.31450759039985,
                        height: 37.50233700400904,
                    },
                },
            ],
            modelParamComponent: VisualQuestionBboxModelParamControl,
            modelParamComponentUiParams: {
                /**
                 * Looks up the label element for an input by its id.
                 *
                 * Only the 'question' input has a custom label (text plus a popover listing
                 * example task descriptions). Any other id is a missing key in the lookup
                 * table, so the function yields undefined for it.
                 */
                getInputLabel: (id: string) => {
                    const labels: { [key: string]: JSX.Element } = {
                        question: (
                            <ModelParamLabel>
                                Task Description:{' '}
                                <Popover
                                    title="Example task descriptions used for training"
                                    content={
                                        <ul>
                                            <li>
                                                <b>Captioning</b>
                                                <ul>
                                                    <li> Generate a caption. </li>
                                                    <li> Generate a description. </li>
                                                    <li> Describe this image. </li>
                                                    <li> Caption this image. </li>
                                                    <li>Generate a caption for this image.</li>
                                                </ul>
                                            </li>
                                            <li>
                                                <b>Classification</b>
                                                <ul>
                                                    <li> What is this? </li>
                                                    <li> What object is this? </li>
                                                    <li> What is this thing? </li>
                                                </ul>
                                            </li>
                                            <li>
                                                <b>Classification-in-Context</b>
                                                <ul>
                                                    <li>
                                                        Classification queries, but with a
                                                        modified bounding box
                                                    </li>
                                                </ul>
                                            </li>
                                            <li>
                                                <b>VQA</b>
                                                <ul>
                                                    <li>What color is the dog?</li>
                                                    <li>What is the woman holding?</li>
                                                </ul>
                                            </li>
                                        </ul>
                                    }>
                                    <Tag color="blue">examples</Tag>
                                </Popover>
                            </ModelParamLabel>
                        ),
                    };
                    return labels[id];
                },
            },
            models: [
                {
                    name: 'GPV 2',
                    markdownDescription: `
A GPV that uses the VinVL object detector and T5 language model. It is trained on data from
the MS COCO dataset for five tasks including classification, localization, visual question answering, 
captioning and classification-in-context.
GPV 2 is also trained on web-search that contains over 10,000 visual concepts so it understands 
a larger range of objects and actions than what appears in MS COCO images.
                    `,
                    answerInfo: GeneralPurposeLanguageAnswerInfo,
                    // Forwards the user's query to the GPV 2 endpoint.
                    apiCall: (query: BaseQuery) => baseRequestFrom(query, 'api/gpv2'),
                    // NOTE(review): presumably milliseconds — confirm against the consumer.
                    estimatedApiDuration: 1900,
                },
                {
                    name: 'GPV 1',
                    markdownDescription: `
A GPV that uses the DETR object detector and can be trained in a fully end-to-end manner. 
It is trained with MS COCO data on four different tasks including classification, localization, visual question answering, and captioning. 
This model does not account for the bounding box input.
`,
                    attribution: {
                        paper: {
                            label: 'Towards General Purpose Vision Systems',
                            url: 'https://www.semanticscholar.org/paper/Towards-General-Purpose-Vision-Systems-Gupta-Kamath/f4af0ed90191e0c30955d61595da815200dac544',
                        },
                        publication: 'arXiv',
                        publicationYear: 2021,
                        authors: ['Tanmay Gupta', 'Amita Kamath', 'Ani Kembhavi', 'Derek Hoiem'],
                    },
                    answerInfo: GeneralPurposeLanguageAnswerInfo,
                    // Forwards the user's query to the GPV 1 endpoint.
                    apiCall: (query: BaseQuery) => baseRequestFrom(query, 'api/gpv'),
                    // NOTE(review): presumably milliseconds — confirm against the consumer.
                    estimatedApiDuration: 1900,
                },
            ],
        } as AnswerPageProps<GPVProps, VisualQuestionBboxParams>,
    },
    // Pose estimation page: single-image input, answer rendered by PoseEstimationAnswerInfo.
    poseEstimation: {
        path: '/pose_estimation',
        label: 'Pose Estimation',
        component: AnswerPage,
        componentProps: {
            title: 'Pose Estimation',
            markdownDescription: `
This task is to estimate 2D or 3D human poses from an image or a video. Human poses are often
represented as a set of body joint coordinates. Accurate estimation of human poses can help many
applications such as action recognition and augmented reality (AR).`,
            // Sample images offered on this page.
            examples: [
                { label: 'Baseball Pitching Motion', img1Src: baseballPitchingMotionSrc },
                { label: 'People Exercising', img1Src: exerciseSrc },
                { label: 'Cosplay Crowd', img1Src: dragonBallZCrowdSrc },
                { label: 'Kids Playing Soccer', img1Src: kidsSoccerSrc },
                { label: 'Students at Museum', img1Src: studentsSrc },
            ],
            modelParamComponent: ImageParamControl,
            // Every input id maps to null, i.e. no custom label is supplied for this page.
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'OpenPose',
                    markdownDescription: `
A standard bottom-up model that supports real-time multi-person 2D pose estimation. Compared to
top-down approaches which detect humans and perform single-person pose estimation for each
detection, this model simultaneously detects and associates human body parts with a set of
non-parametric limb representations called Part Affinity Fields (PAFs).`,
                    attribution: {
                        paper: {
                            label: 'Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields',
                            url: 'https://www.semanticscholar.org/paper/9e8db1519245426f3a78752a3d8360484f4626b1',
                        },
                        publication: 'CVPR',
                        publicationYear: 2017,
                        authors: [
                            'Zhe Cao',
                            'Gines Hidalgo',
                            'Tomáš Šimon',
                            'Shih-En Wei',
                            'Yaser Sheikh',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/CMU-Perceptual-Computing-Lab/openpose',
                        },
                    },
                    answerInfo: PoseEstimationAnswerInfo,
                    // Forwards the user's query to the OpenPose endpoint.
                    apiCall: (query: BaseQuery) => baseRequestFrom(query, 'api/openpose'),
                    // NOTE(review): presumably milliseconds — confirm against the consumer.
                    estimatedApiDuration: 2500,
                },
            ],
        } as AnswerPageProps<PoseEstimationAnswer, ImageModelParams>,
    },

    // Depth estimation page: single-image input, answer rendered by DepthEstimationAnswerInfo.
    depthEstimation: {
        path: '/depth_estimation',
        // NOTE(review): unlike the neighbouring tasks, this label differs from the page title
        // ('Depth Estimation') — confirm the shorter wording is intentional.
        label: 'Depths',
        component: AnswerPage,
        componentProps: {
            title: 'Depth Estimation',
            markdownDescription: `
This task is to estimate depth from a single color image. Estimating absolute or relative depth from
camera inputs can help many applications such as scene understanding, self-driving (along with LIDAR
sensors), and augmented reality (AR).`,
            // Sample images offered on this page.
            examples: [
                { label: 'Buildings', img1Src: buildingsSrc },
                { label: 'Desert Road', img1Src: desertRoadSrc },
                { label: 'City Street', img1Src: cityStreetSrc },
                { label: 'Island Road', img1Src: islandRoadSrc },
            ],
            modelParamComponent: ImageParamControl,
            // Every input id maps to null, i.e. no custom label is supplied for this page.
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'MonoDepth2',
                    markdownDescription: `
A self-supervised monocular depth estimation model. The authors propose three improvements on
architecture and self-supervision loss: (1) per-pixel minimum reprojection loss, (2) auto-masking
stationary pixels, and (3) multi-scale estimation.`,
                    attribution: {
                        paper: {
                            label: 'Digging into Self-Supervised Monocular Depth Prediction',
                            url: 'https://www.semanticscholar.org/paper/589cfcb2f995c94b0a98c902cc1f5e0f27cbd927',
                        },
                        publication: 'ICCV',
                        publicationYear: 2019,
                        authors: [
                            'Clément Godard',
                            'Oisin Mac Aodha',
                            'Michael Firman',
                            'Gabriel J. Brostow',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/nianticlabs/monodepth2',
                        },
                    },
                    answerInfo: DepthEstimationAnswerInfo,
                    // Forwards the user's query to the MonoDepth2 endpoint.
                    apiCall: (query: BaseQuery) => baseRequestFrom(query, 'api/monodepth2'),
                    // NOTE(review): presumably milliseconds — confirm against the consumer.
                    estimatedApiDuration: 16100,
                },
            ],
        } as AnswerPageProps<DepthEstimationAnswer, ImageModelParams>,
    },

    // Surface normals page: single-image input, answer rendered by
    // SurfaceNormalEstimationAnswerInfo.
    surfaceNormals: {
        path: '/surface_normals',
        label: 'Surface Normals',
        component: AnswerPage,
        componentProps: {
            title: 'Surface Normals',
            markdownDescription: `
Estimating the normal vectors to all the surfaces in the image. Note that this is highly linked to
depth estimation as the normal of a surface is orthogonal to its change in depth.`,
            // Sample images offered on this page.
            examples: [
                { label: 'Tree House', img1Src: treeHouseSrc },
                { label: 'Dining Room', img1Src: diningRoomSrc },
                { label: 'Hallway', img1Src: hallwaySrc },
                { label: 'London Street', img1Src: londonStreetSrc },
            ],
            modelParamComponent: ImageParamControl,
            // Every input id maps to null, i.e. no custom label is supplied for this page.
            modelParamComponentUiParams: { getInputLabel: () => null },
            models: [
                {
                    name: 'Multi Task Refinenet',
                    markdownDescription: `
This model used an encoder decoder structure, but the output of this is used to predict 3 very
related aspects of the scene: segmentation, depth and surface normals. For images which only have
annotations for a subset of these, they use an expert model to provide preliminary annotations in
order to avoid having a biased gradient.`,
                    attribution: {
                        paper: {
                            label: 'Real-Time Joint Semantic Segmentation and Depth Estimation Using Asymmetric Annotations',
                            url: 'https://www.semanticscholar.org/paper/435d4b5c30f10753d277848a17baddebd98d3c31',
                        },
                        publication: 'ICRA',
                        publicationYear: 2019,
                        authors: [
                            'Vladimir Nekrasov',
                            'Thanuja Dharmasiri',
                            'Andrew Spek',
                            'Tom Drummond',
                            'Chunhua Shen',
                            'Ian D. Reid',
                        ],
                        repo: {
                            label: 'github',
                            url: 'https://github.com/DrSleep/multi-task-refinenet',
                        },
                    },
                    answerInfo: SurfaceNormalEstimationAnswerInfo,
                    // Forwards the user's query to the multi-task RefineNet endpoint.
                    apiCall: (query: BaseQuery) =>
                        baseRequestFrom(query, 'api/multitaskrefinenet'),
                    // NOTE(review): presumably milliseconds — confirm against the consumer.
                    estimatedApiDuration: 77900,
                },
            ],
        } as AnswerPageProps<SurfaceNormalEstimationAnswer, ImageModelParams>,
    },
};

/**
 * Task routes grouped into labelled categories, each with its own icon.
 *
 * Every entry references task definitions from the `tasks` object above, in the order listed
 * here. Note that `tasks.periapical` is included under 'Recognition' even though its definition
 * sets `hideFromMenu: true`.
 *
 * NOTE(review): presumably these groups drive the sections of the site menu — confirm against
 * the component that consumes `categories`.
 */
export const categories: AppRouteGroup[] = [
    {
        label: 'Recognition',
        iconSrc: recognitionIcon,
        routes: [tasks.classification, tasks.objectDetection, tasks.segmentation, tasks.periapical],
    },
    {
        label: 'Vision and Language',
        iconSrc: visionIcon,
        routes: [
            tasks.imageGeneration,
            tasks.visualQuestion,
            tasks.situationRecognition,
            tasks.groundedSituationRecognition,
            tasks.captioning,
        ],
    },
    {
        label: 'Multi-Task',
        iconSrc: multiTaskIcon,
        routes: [tasks.generalPurposeLanguage],
    },
    {
        label: 'Human Centric Vision',
        iconSrc: poseIcon,
        routes: [tasks.poseEstimation],
    },
    {
        label: 'Scene Geometry',
        iconSrc: surfaceNormalsIcon,
        routes: [tasks.depthEstimation, tasks.surfaceNormals],
    },
];
