{
"name": "AI process",
"children": [
{
"name": "Vision process",
"children": [
{
"name": "Image tagging",
"description": "Annotate an image of some sort, typically with terms from a controlled vocabulary.",
"has_input": [
"Image"
],
"has_output": [
"Text"
],
"children": [
{
"name": "Image captioning",
"description": "Image captioning is the automatic generation of natural language descriptions of the content of an input image.",
"has_input": [
"Image"
],
"has_output": [
"Text"
],
"hasExactSynonym": "['Automatic image annotation']",
"children": [
{
"name": "Phrase grounding",
"description": "Assigning each entity mentioned in a given caption of an image to a corresponding location in the respective image.",
"has_input": [
"Image"
]
},
{
"name": "Relational captioning"
}
]
},
{
"name": "Meme classification",
"description": "Meme classification refers to the task of classifying internet memes.",
"has_input": [
"Image"
]
}
]
},
{
"name": "Optical character recognition",
"children": [
{
"name": "Handwriting recognition",
"children": [
{
"name": "Handwritten digit recognition"
}
]
},
{
"name": "Active learning",
"description": "**Active Learning** is a paradigm in supervised machine learning which uses fewer training examples to achieve better optimization by iteratively training a predictor, and using the predictor in each iteration to choose the training examples which will increase its chances of finding better configurations and at the same time improving the accuracy of the prediction model (Source: paperswithcode.com)",
"children": [
{
"name": "Active object detection",
"description": "Active Learning for Object Detection (Source: paperswithcode.com)"
}
]
}
]
},
{
"name": "Trajectory prediction",
"description": "**Trajectory Prediction** is the problem of predicting the short-term (1-3 seconds) and long-term (3-5 seconds) spatial coordinates of various road-agents such as cars, buses, pedestrians, rickshaws, and animals, etc. These road-agents have different dynamic behaviors that may correspond to aggressive or conservative driving styles. (Source: paperswithcode.com)",
"children": [
{
"name": "Trajectory forecasting",
"description": "Trajectory forecasting is a sequential prediction task, where a forecasting model predicts future trajectories of all moving agents (humans, vehicles, etc.) in a scene, based on their past trajectories and/or the scene context.\n\n(Illustrative figure from [Social NCE: Contrastive Learning of Socially-aware Motion Representations](https://github.com/vita-epfl/social-nce)) (Source: paperswithcode.com)"
}
]
},
{
"name": "Depth estimation",
"description": "**Depth Estimation** is a crucial step towards inferring scene geometry from 2D images. The goal in monocular Depth Estimation is to predict the depth value of each pixel, given only a single RGB image as input. (Source: paperswithcode.com)",
"children": [
{
"name": "Monocular depth estimation",
"description": "The **Monocular Depth Estimation** is the task of estimating scene depth using a single image. (Source: paperswithcode.com)"
},
{
"name": "Stereo depth estimation"
},
{
"name": "Indoor monocular depth estimation"
},
{
"name": "Stereo-lidar fusion",
"description": "Depth estimation using stereo cameras and a LiDAR sensor. (Source: paperswithcode.com)"
}
]
},
{
"name": "Super-resolution",
"description": "Super resolution is the task of taking an input of a low resolution (LR) and upscaling it to that of a high resolution. (Source: paperswithcode.com)",
"children": [
{
"name": "Image super-resolution",
"description": "Image super-resolution (SR) techniques reconstruct a higher-resolution image or sequence from the observed lower-resolution images. Usually the benchmarks are single-image super-resolution (SISR) tasks. (Source: paperswithcode.com)",
"children": [
{
"name": "Multi-frame super-resolution",
"description": "When multiple images of the same view are taken from slightly different positions, perhaps also at different times, then they collectively contain more information than any single image on its own. Multi-Frame Super-Resolution fuses these low-res inputs into a composite high-res image that can reveal some of the original detail that cannot be recovered from any low-res image alone. (Source: paperswithcode.com)"
},
{
"name": "Audio super-resolution",
"description": "AUDIO SUPER-RESOLUTION or speech bandwidth extension (Upsampling Ratio = 2) (Source: paperswithcode.com)"
}
]
},
{
"name": "Video super-resolution",
"description": "Video super-resolution is the task of upscaling a video from a low-resolution to a high-resolution. (Source: paperswithcode.com)"
}
]
},
{
"name": "Video process",
"children": [
{
"name": "Video super-resolution",
"description": "Video super-resolution is the task of upscaling a video from a low-resolution to a high-resolution. (Source: paperswithcode.com)"
},
{
"name": "Video understanding",
"description": "A crucial task of **Video Understanding** is to recognise and localise (in space and time) different actions or events appearing in the video. (Source: paperswithcode.com)",
"children": [
{
"name": "Video alignment"
}
]
},
{
"name": "Object tracking",
"description": "Object tracking is the task of taking an initial set of object detections, creating a unique ID for each of the initial detections, and then tracking each of the objects as they move around frames in a video, maintaining the ID assignment. (Source: paperswithcode.com)",
"children": [
{
"name": "Visual object tracking",
"description": "**Visual Object Tracking** is an important research topic in computer vision, image understanding and pattern recognition. Given the initial state (centre location and scale) of a target in the first frame of a video sequence, the aim of Visual Object Tracking is to automatically obtain the states of the object in the subsequent video frames. (Source: paperswithcode.com)"
},
{
"name": "Multi-object tracking",
"children": [
{
"name": "3D multi-object tracking",
"description": "Image: [Weng et al](https://arxiv.org/pdf/1907.03961v4.pdf) (Source: paperswithcode.com)"
}
]
},
{
"name": "Multiple object tracking",
"description": "**Multiple Object Tracking** is the problem of automatically identifying multiple objects in a video and representing them as a set of trajectories with high accuracy. (Source: paperswithcode.com)"
},
{
"name": "Online multi-object tracking",
"description": "The goal of **Online Multi-Object Tracking** is to estimate the spatio-temporal trajectories of multiple objects in an online video stream (i.e., the video is provided frame-by-frame), which is a fundamental problem for numerous real-time applications, such as video surveillance, autonomous driving, and robot navigation. (Source: paperswithcode.com)"
},
{
"name": "3D object tracking"
}
]
},
{
"name": "Video prediction",
"description": "**Video Prediction** is the task of predicting future frames given past video frames. (Source: paperswithcode.com)",
"has_input": [
"Video"
],
"has_output": [
"Video"
]
},
{
"name": "Action classification",
"children": [
{
"name": "Skeleton based action recognition"
}
]
},
{
"name": "Video classification",
"description": "**Video Classification** is the task of producing a label that is relevant to the video given its frames. A good video level classifier is one that not only provides accurate frame labels, but also best describes the entire video given the features and the annotations of the various frames in the video. For example, a video might contain a tree in some frame, but the label that is central to the video might be something else (e.g., \u201chiking\u201d). The granularity of the labels that are needed to describe the frames and the video depends on the task. Typical tasks include assigning one or more global labels to the video, and assigning one or more labels for each frame inside the video. (Source: paperswithcode.com)",
"children": [
{
"name": "Action recognition in videos",
"children": [
{
"name": "Self-supervised action recognition"
}
]
},
{
"name": "Action recognition",
"description": "Please note some benchmarks may be located in the [Action Classification](https://paperswithcode.com/task/action-classification) or [Video Classification](https://paperswithcode.com/task/video-classification) tasks, e.g. Kinetics-400. (Source: paperswithcode.com)",
"children": [
{
"name": "3D action recognition",
"description": "Image: [Rahmani et al](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Rahmani_3D_Action_Recognition_CVPR_2016_paper.pdf) (Source: paperswithcode.com)",
"children": [
{
"name": "Skeleton based action recognition"
},
{
"name": "Zero shot skeletal action recognition",
"description": "Zero-Shot Learning for 3D skeletal action recognition (Source: paperswithcode.com)",
"children": [
{
"name": "Generalized zero shot skeletal action recognition",
"description": "Generalized Zero Shot Learning for 3d Skeletal Action Recognition (Source: paperswithcode.com)"
}
]
}
]
},
{
"name": "Self-supervised action recognition"
},
{
"name": "Action triplet recognition",
"description": "Recognising action as a triplet of subject verb and object. Example HOI = Human Object Interaction, Surgical IVT = Instrument Verb Target, etc. (Source: paperswithcode.com)"
}
]
}
]
},
{
"name": "Video generation"
},
{
"name": "Video frame interpolation",
"description": "The goal of **Video Frame Interpolation** is to synthesize several frames in the middle of two adjacent frames of the original video. Video Frame Interpolation can be applied to generate slow motion video, increase video frame rate, and frame recovery in video streaming. (Source: paperswithcode.com)"
},
{
"name": "Video retrieval",
"description": "The objective of video retrieval is as follows: given a text query and a pool of candidate videos, select the video which corresponds to the text query. Typically, the videos are returned as a ranked list of candidates and scored via document retrieval metrics. (Source: paperswithcode.com)",
"children": [
{
"name": "Replay grounding",
"description": "Replay grounding is introduced in SoccerNet-v2 in the case of videos of soccer games. Given a replay shot of a soccer action, the objective is to retrieve when said action occurs within the whole live game. (Source: paperswithcode.com)"
}
]
},
{
"name": "Video denoising"
},
{
"name": "Video summarization",
"description": "**Video Summarization** is the process of compacting a video down to only important components in the video. (Source: paperswithcode.com)",
"children": [
{
"name": "Unsupervised video summarization"
},
{
"name": "Supervised video summarization"
}
]
},
{
"name": "Video-to-video synthesis",
"description": "Learning a mapping function from an input source video to an output video. (Source: paperswithcode.com)"
},
{
"name": "Activity recognition in videos",
"children": [
{
"name": "Activity prediction",
"description": "Predict human activities in videos (Source: paperswithcode.com)"
}
]
},
{
"name": "Anomaly detection in surveillance videos"
},
{
"name": "Abnormal event detection in video",
"description": "**Abnormal Event Detection In Video** is a challenging task in computer vision, as the definition of what an abnormal event looks like depends very much on the context. For instance, a car driving by on the street is regarded as a normal event, but if the car enters a pedestrian area, this is regarded as an abnormal event. A person running on a sports court (normal event) versus running outside from a bank (abnormal event) is another example. Although what is considered abnormal depends on the context, we can generally agree that abnormal events should be unexpected events that occur less often than familiar (normal) events (Source: paperswithcode.com)",
"children": [
{
"name": "Anomaly detection in surveillance videos"
},
{
"name": "Semi-supervised anomaly detection"
}
]
},
{
"name": "Action spotting"
},
{
"name": "Video deinterlacing"
},
{
"name": "Video story qa",
"description": "MCQ about clips from movies/tvshows/etc (Source: paperswithcode.com)"
},
{
"name": "Video object segmentation",
"description": "Video object segmentation is a binary labeling problem aiming to separate foreground object(s) from the background region of a video.\n\nFor leaderboards please refer to the different subtasks. (Source: paperswithcode.com)",
"children": [
{
"name": "Video salient object detection",
"description": "Video salient object detection (VSOD) is significantly essential for understanding the underlying mechanism behind HVS during free-viewing in general and instrumental to a wide range of real-world applications, e.g., video segmentation, video captioning, video compression, autonomous driving, robotic interaction, weakly supervised attention. Besides its academic value and practical significance, VSOD presents great difficulties due to the challenges carried by video data (diverse motion patterns, occlusions, blur, large object deformations, etc.) and the inherent complexity of human visual attention behavior (i.e., selective attention allocation, attention shift) during dynamic scenes. Online benchmark: http://dpfan.net/davsod. (Source: paperswithcode.com)"
},
{
"name": "Unsupervised video object segmentation",
"description": "The unsupervised scenario assumes that the user does not interact with the algorithm to obtain the segmentation masks. Methods should provide a set of object candidates with no overlapping pixels that span through the whole video sequence. This set of objects should contain at least the objects that capture human attention when watching the whole video sequence i.e objects that are more likely to be followed by human gaze. (Source: paperswithcode.com)"
},
{
"name": "Semi-supervised video object segmentation",
"description": "The semi-supervised scenario assumes the user inputs a full mask of the object(s) of interest in the first frame of a video sequence. Methods have to produce the segmentation mask for that object(s) in the subsequent frames. (Source: paperswithcode.com)",
"children": [
{
"name": "One-shot visual object segmentation"
}
]
},
{
"name": "Interactive video object segmentation",
"description": "The interactive scenario assumes the user gives iterative refinement inputs to the algorithm, in our case in the form of a scribble, to segment the objects of interest. Methods have to produce a segmentation mask for that object in all the frames of a video sequence taking into account all the user interactions. (Source: paperswithcode.com)"
}
]
},
{
"name": "SpO2 estimation",
"description": "SpO2 estimation (Source: paperswithcode.com)"
}
]
},
{
"name": "Reconstruction",
"children": [
{
"name": "Single-view 3D reconstruction"
},
{
"name": "Single-image-based Hdr reconstruction"
},
{
"name": "3D human reconstruction"
}
]
},
{
"name": "Facial recognition and modelling",
"description": "Facial tasks in machine learning operate based on images or video frames (or other datasets) focussed on human faces. (Source: paperswithcode.com)",
"children": [
{
"name": "Face recognition",
"description": "Facial recognition is the task of making a positive identification of a face in a photo or video image against a pre-existing database of faces. It begins with detection - distinguishing human faces from other objects in the image - and then works on identification of those detected faces.\n\nThe state of the art tables for this task are contained mainly in the consistent parts of the task : the face verification and face identification tasks. (Source: paperswithcode.com)",
"children": [
{
"name": "Age-invariant face recognition",
"description": "Age-invariant face recognition is the task of performing face recognition that is invariant to differences in age. (Source: paperswithcode.com)"
},
{
"name": "Heterogeneous face recognition",
"description": "Heterogeneous face recognition is the task of matching face images acquired from different sources (i.e., different sensors or different wavelengths) for identification or verification. (Source: paperswithcode.com)"
},
{
"name": "Face quality assessement",
"description": "Estimate the usability of a given face image for recognition (Source: paperswithcode.com)"
}
]
},
{
"name": "Face detection",
"description": "Face detection is the task of detecting faces in a photo or video (and distinguishing them from other objects). (Source: paperswithcode.com)",
"children": [
{
"name": "Occluded face detection"
}
]
},
{
"name": "Face alignment",
"description": "Face alignment is the task of identifying the geometric structure of faces in digital images, and attempting to obtain a canonical alignment of the face based on translation, scale, and rotation. (Source: paperswithcode.com)"
},
{
"name": "Face verification",
"description": "Face verification is the task of comparing a candidate face to another, and verifying whether it is a match. It is a one-to-one mapping: you have to check if this person is the correct one. (Source: paperswithcode.com)",
"children": [
{
"name": "Disguised face verification"
}
]
},
{
"name": "Facial expression recognition",
"description": "Facial expression recognition is the task of classifying the expressions on face images into various categories such as anger, fear, surprise, sadness, happiness and so on. (Source: paperswithcode.com)",
"children": [
{
"name": "3D facial expression recognition",
"description": "3D facial expression recognition is the task of modelling facial expressions in 3D from an image or video. (Source: paperswithcode.com)"
},
{
"name": "Smile recognition",
"description": "Smile recognition is the task of recognising a smiling face in a photo or video. (Source: paperswithcode.com)"
}
]
},
{
"name": "Facial landmark detection",
"description": "Facial landmark detection is the task of detecting key landmarks on the face and tracking them (being robust to rigid and non-rigid facial deformations due to head movements and facial expressions). (Source: paperswithcode.com)",
"children": [
{
"name": "Unsupervised facial landmark detection",
"description": "Facial landmark detection in the unsupervised setting popularized by [1]. The evaluation occurs in two stages:\n(1) Embeddings are first learned in an unsupervised manner (i.e. without labels);\n(2) A simple regressor is trained to regress landmarks from the unsupervised embedding.\n\n[1] Thewlis, James, Hakan Bilen, and Andrea Vedaldi. \"Unsupervised learning of object landmarks by factorized spatial embeddings.\" Proceedings of the IEEE International Conference on Computer Vision. 2017. (Source: paperswithcode.com)"
},
{
"name": "3D facial landmark localization",
"description": "Image: [Zhang et al](https://arxiv.org/pdf/1801.09242v1.pdf) (Source: paperswithcode.com)"
}
]
},
{
"name": "Face reconstruction",
"description": "Face reconstruction is the task of recovering the facial geometry of a face from an image. (Source: paperswithcode.com)",
"children": [
{
"name": "3D face reconstruction",
"description": "3D face reconstruction is the task of reconstructing a face from an image into a 3D form (or mesh). (Source: paperswithcode.com)"
}
]
},
{
"name": "Face identification",
"description": "Face identification is the task of matching a given face image to one in an existing database of faces. It is the second part of face recognition (the first part being detection). It is a one-to-many mapping: you have to find an unknown person in a database to find who that person is. (Source: paperswithcode.com)"
},
{
"name": "Face swapping",
"description": "Face swapping refers to the task of swapping faces between images or in an video, while maintaining the rest of the body and environment context. (Source: paperswithcode.com)"
},
{
"name": "Facial action unit detection",
"description": "Facial action unit detection is the task of detecting action units from a video of a face - for example, lip tightening and cheek raising. (Source: paperswithcode.com)"
},
{
"name": "Action unit detection",
"description": "Action unit detection is the task of detecting action units from a video - for example, types of facial action units (lip tightening, cheek raising) from a video of a face. (Source: paperswithcode.com)"
},
{
"name": "Age estimation",
"description": "Age Estimation is the task of estimating the age of a person from an image. (Source: paperswithcode.com)"
},
{
"name": "Gender prediction"
},
{
"name": "Face hallucination",
"description": "Face hallucination is the task of generating high-resolution (HR) facial images from low-resolution (LR) inputs. (Source: paperswithcode.com)"
},
{
"name": "Facial beauty prediction",
"description": "Facial beauty prediction is the task of predicting the attractiveness of a face. (Source: paperswithcode.com)"
},
{
"name": "Facial attribute classification",
"description": "Facial attribute classification is the task of classifying various attributes of a facial image - e.g. whether someone has a beard, is wearing a hat, and so on. (Source: paperswithcode.com)"
},
{
"name": "Age and gender classification",
"description": "Age and gender classification is a dual-task of identifying the age and gender of a person from an image or video. (Source: paperswithcode.com)"
}
]
},
{
"name": "Face anti-spoofing"
},
{
"name": "Deception detection",
"children": [
{
"name": "Face anti-spoofing"
}
]
},
{
"name": "Sketch",
"children": [
{
"name": "Face sketch synthesis",
"description": "Face sketch synthesis is the task of generating a sketch from an input face photo. (Source: paperswithcode.com)"
}
]
},
{
"name": "Autonomous vehicle task",
"description": "Autonomous vehicles is the task of making a vehicle that can guide itself without human conduction.\n\nMany of the state-of-the-art results can be found at more general task pages such as [3D Object Detection](https://paperswithcode.com/task/3d-object-detection) and [Semantic Segmentation](https://paperswithcode.com/task/semantic-segmentation). (Source: paperswithcode.com)",
"children": [
{
"name": "Autonomous driving",
"description": "Autonomous driving is the task of driving a vehicle without human conduction. \n\nMany of the state-of-the-art results can be found at more general task pages such as [3D Object Detection](https://paperswithcode.com/task/3d-object-detection) and [Semantic Segmentation](https://paperswithcode.com/task/semantic-segmentation). (Source: paperswithcode.com)",
"children": [
{
"name": "Motion forecasting",
"description": "Motion forecasting is the task of predicting the location of a tracked object in the future (Source: paperswithcode.com)",
"children": [
{
"name": "Multiple object forecasting"
}
]
}
]
},
{
"name": "Autonomous navigation",
"description": "Autonomous navigation is the task of autonomously navigating a vehicle or robot to or around a location without human guidance. (Source: paperswithcode.com)",
"children": [
{
"name": "Autonomous flight (dense forest)",
"description": "Number of interventions during autonomous flight under the forest canopy. (Source: paperswithcode.com)"
}
]
},
{
"name": "Pedestrian detection",
"description": "Pedestrian detection is the task of detecting pedestrians from a camera.\n\nFurther state-of-the-art results (e.g. on the KITTI dataset) can be found at [3D Object Detection](https://paperswithcode.com/task/object-detection). (Source: paperswithcode.com)"
},
{
"name": "Lane detection",
"description": "Lane detection is the task of detecting lanes on a road from a camera. (Source: paperswithcode.com)"
},
{
"name": "Traffic sign recognition",
"description": "Traffic sign recognition is the task of recognising traffic signs in an image or video. (Source: paperswithcode.com)"
},
{
"name": "Pedestrian attribute recognition",
"description": "Pedestrian attribution recognition is the task of recognising pedestrian features - such as whether they are talking on a phone, whether they have a backpack, and so on. (Source: paperswithcode.com)"
},
{
"name": "3D car instance understanding",
"description": "3D Car Instance Understanding is the task of estimating properties (e.g.translation, rotation and shape) of a moving or parked vehicle on the road. (Source: paperswithcode.com)",
"children": [
{
"name": "Hand gesture recognition",
"children": [
{
"name": "Skeleton based action recognition"
}
]
}
]
}
]
},
{
"name": "3D vision process",
"children": [
{
"name": "Motion forecasting",
"description": "Motion forecasting is the task of predicting the location of a tracked object in the future (Source: paperswithcode.com)",
"children": [
{
"name": "Multiple object forecasting"
}
]
},
{
"name": "3D car instance understanding",
"description": "3D Car Instance Understanding is the task of estimating properties (e.g.translation, rotation and shape) of a moving or parked vehicle on the road. (Source: paperswithcode.com)",
"children": [
{
"name": "Hand gesture recognition",
"children": [
{
"name": "Skeleton based action recognition"
}
]
}
]
},
{
"name": "3D human pose estimation",
"children": [
{
"name": "Pose prediction",
"description": "Pose prediction is to predict future poses given a window of previous poses. (Source: paperswithcode.com)"
},
{
"name": "3D absolute human pose estimation",
"description": "This task aims to solve absolute (camera-centric not root-relative) 3D human pose estimation. (Source: paperswithcode.com)",
"children": [
{
"name": "3D face animation",
"description": "Image: [Cudeiro et al](https://arxiv.org/pdf/1905.03079v1.pdf) (Source: paperswithcode.com)",
"children": [
{
"name": "Image generation",
"description": "Image generation (synthesis) is the task of generating new images from an existing dataset.\n\n- **Unconditional generation** refers to generating samples unconditionally from the dataset, i.e. $p(y)$\n- **Conditional image generation** (subtask) refers to generating samples conditionally from the dataset, based on a label, i.e. $p(y|x)$.\n\nIn this section, you can find state-of-the-art leaderboards for **unconditional generation**. For conditional generation, and other types of image generations, refer to the subtasks. (Source: paperswithcode.com)",
"children": [
{
"name": "Face generation",
"description": "Face generation is the task of generating (or interpolating) new faces from an existing dataset.\n\nThe state-of-the-art results for this task are located in the Image Generation parent. (Source: paperswithcode.com)",
"children": [
{
"name": "Talking face generation",
"description": "Talking face generation aims to synthesize a sequence of face images that correspond to given speech semantics. (Source: paperswithcode.com)"
},
{
"name": "Talking head generation",
"description": "Talking head generation is the task of generating a talking face from a set of images of a person. (Source: paperswithcode.com)",
"children": [
{
"name": "Unconstrained lip-synchronization",
"description": "Given a video of an arbitrary person, and an arbitrary driving speech, the task is to generate a lip-synced video that matches the given speech. \n\nThis task requires the approach to not be constrained by identity, voice, or language. (Source: paperswithcode.com)"
}
]
}
]
},
{
"name": "Image inpainting",
"description": "**Image Inpainting** is a task of reconstructing missing regions in an image. It is an important problem in computer vision and an essential functionality in many imaging and graphics applications, e.g. object removal, image restoration, manipulation, re-targeting, compositing, and image-based rendering. (Source: paperswithcode.com)",
"children": [
{
"name": "Facial inpainting",
"description": "Facial inpainting (or face completion) is the task of generating plausible facial structures for missing pixels in a face image. (Source: paperswithcode.com)"
},
{
"name": "Image outpainting",
"description": "Predicting the visual context of an image beyond its boundary. (Source: paperswithcode.com)"
},
{
"name": "Cloud removal"
}
]
},
{
"name": "Image-to-image translation",
"description": "Image-to-image translation is the task of taking images from one domain and transforming them so they have the style (or characteristics) of images from another domain. (Source: paperswithcode.com)",
"children": [
{
"name": "Unsupervised image-to-image translation",
"description": "Unsupervised image-to-image translation is the task of doing image-to-image translation without ground truth image-to-image pairings. (Source: paperswithcode.com)"
},
{
"name": "Synthetic-to-real translation",
"description": "Synthetic-to-real translation is the task of domain adaptation from synthetic (or virtual) data to real data. (Source: paperswithcode.com)"
},
{
"name": "Multimodal unsupervised image-to-image translation",
"description": "Multimodal unsupervised image-to-image translation is the task of producing multiple translations to one domain from a single image in another domain. (Source: paperswithcode.com)"
},
{
"name": "Cross-view image-to-image translation"
},
{
"name": "Facial makeup transfer",
"description": "Facial makeup transfer aims to translate the **makeup style** from a given *reference* makeup face image to another non-makeup one while *preserving face identity*. (Source: paperswithcode.com)"
},
{
"name": "Fundus to angiography generation",
"description": "Generating Retinal Fluorescein Angiography from Retinal Fundus Image using Generative Adversarial Networks. (Source: paperswithcode.com)"
}
]
},
{
"name": "Conditional image generation",
"description": "Conditional image generation is the task of generating new images from a dataset conditional on their class. (Source: paperswithcode.com)"
},
{
"name": "Text-to-image generation",
"has_input": [
"Text"
],
"has_output": [
"Image"
],
"children": [
{
"name": "Zero-shot text-to-image generation"
}
]
},
{
"name": "Pose transfer"
},
{
"name": "Layout-to-image generation",
"description": "Layout-to-image generation its the task to generate a scene based on the given layout. The layout describes the location of the objects to be included in the output image.\nIn this section, you can find state-of-the-art leaderboards for Layout-to-image generation. (Source: paperswithcode.com)"
}
]
}
]
}
]
},
{
"name": "3D multi-person pose estimation",
"description": "This task aims to solve root-relative 3D multi-person pose estimation. No human bounding box and root joint coordinate groundtruth are used in testing time. (Source: paperswithcode.com)",
"children": [
{
"name": "3D multi-person pose estimation (root-relative)",
"description": "This task aims to solve root-relative 3D multi-person pose estimation (person-centric coordinate system). No ground truth human bounding box and human root joint coordinates are used during testing stage. (Source: paperswithcode.com)"
},
{
"name": "3D multi-person pose estimation (absolute)",
"description": "This task aims to solve absolute 3D multi-person pose Estimation (camera-centric coordinates). No ground truth human bounding box and human root joint coordinates are used during testing stage. (Source: paperswithcode.com)"
}
]
},
{
"name": "Monocular 3D human pose estimation",
"description": "This task targets at 3D human pose estimation with a single RGB camera. (Source: paperswithcode.com)"
},
{
"name": "Weakly-supervised 3D human pose estimation",
"description": "This task targets at 3D Human Pose Estimation with fewer 3D annotation. (Source: paperswithcode.com)"
}
]
},
{
"name": "3D reconstruction",
"description": "Image: [Gwak et al](https://arxiv.org/pdf/1705.10904v2.pdf) (Source: paperswithcode.com)",
"children": [
{
"name": "3D scene reconstruction",
"children": [
{
"name": "Face generation",
"description": "Face generation is the task of generating (or interpolating) new faces from an existing dataset.\n\nThe state-of-the-art results for this task are located in the Image Generation parent. (Source: paperswithcode.com)",
"children": [
{
"name": "Talking face generation",
"description": "Talking face generation aims to synthesize a sequence of face images that correspond to given speech semantics. (Source: paperswithcode.com)"
},
{
"name": "Talking head generation",
"description": "Talking head generation is the task of generating a talking face from a set of images of a person. (Source: paperswithcode.com)",
"children": [
{
"name": "Unconstrained lip-synchronization",
"description": "Given a video of an arbitrary person, and an arbitrary driving speech, the task is to generate a lip-synced video that matches the given speech. \n\nThis task requires the approach to not be constrained by identity, voice, or language. (Source: paperswithcode.com)"
}
]
}
]
}
]
},
{
"name": "3D room layouts from a single rgb panorama",
"description": "Image: [Zou et al](https://arxiv.org/pdf/1803.08999v1.pdf) (Source: paperswithcode.com)"
}
]
},
{
"name": "3D human action recognition",
"children": [
{
"name": "Skeleton based action recognition"
}
]
},
{
"name": "3D pose estimation"
},
{
"name": "Playing FPS games",
"description": "First-person shooter (FPS) games Involve like call of duty so enjoy (Source: paperswithcode.com)",
"children": [
{
"name": "Playing game of Doom",
"description": "Doom is an FPS game : the task is typically to train an agent to navigate the game environment, and additionally, acquire points by eliminating enemies. (Source: paperswithcode.com)"
}
]
},
{
"name": "3D object classification",
"description": "Image: [Sedaghat et al](https://arxiv.org/pdf/1604.03351v2.pdf) (Source: paperswithcode.com)"
},
{
"name": "3D point cloud matching",
"description": "Image: [Gojic et al](https://openaccess.thecvf.com/content_CVPR_2019/papers/Gojcic_The_Perfect_Match_3D_Point_Cloud_Matching_With_Smoothed_Densities_CVPR_2019_paper.pdf) (Source: paperswithcode.com)"
},
{
"name": "3D feature matching",
"description": "Image: [Choy et al](https://paperswithcode.com/paper/fully-convolutional-geometric-features) (Source: paperswithcode.com)"
},
{
"name": "3D shape modeling",
"description": "Image: [Gkioxari et al](https://arxiv.org/pdf/1906.02739v2.pdf) (Source: paperswithcode.com)"
},
{
"name": "Video reconstruction"
},
{
"name": "3D shape reconstruction",
"description": "Image credit: [GSNet: Joint Vehicle Pose and Shape Reconstruction with Geometrical and Scene-aware Supervision\n, ECCV'20](https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123600511.pdf) (Source: paperswithcode.com)",
"children": [
{
"name": "3D shape reconstruction from a single 2D image",
"description": "Image: [Liao et al](https://arxiv.org/pdf/1811.12016v1.pdf) (Source: paperswithcode.com)"
}
]
},
{
"name": "3D shape classification",
"description": "Image: [Sun et al](https://arxiv.org/pdf/1804.04610v1.pdf) (Source: paperswithcode.com)"
},
{
"name": "3D face modeling"
},
{
"name": "Occluded 3D object symmetry detection"
},
{
"name": "3D object tracking"
}
]
},
{
"name": "Domain adaptation",
"description": "Domain adaptation is the task of adapting models across domains. (Source: paperswithcode.com)",
"children": [
{
"name": "Unsupervised domain adaptation",
"description": "**Unsupervised Domain Adaptation** is a learning framework to transfer knowledge learned from source domains with a large number of annotated training examples to target domains with unlabeled data only. (Source: paperswithcode.com)"
},
{
"name": "Domain generalization",
"description": "The idea of **Domain Generalization** is to learn from one or multiple training domains, to extract a domain-agnostic model which can be applied to an unseen domain (Source: paperswithcode.com)"
},
{
"name": "Partial domain adaptation",
"description": "**Partial Domain Adaptation** is a transfer learning paradigm, which manages to transfer relevant knowledge from a large-scale source domain to a small-scale target domain. (Source: paperswithcode.com)"
},
{
"name": "Continuously indexed domain adaptation",
"description": "Continuously indexed domain adaptation adapts across continuously indexed domains, e.g., across patients of different ages, where 'age' is a continuous notion. (Source: paperswithcode.com)"
},
{
"name": "Wildly unsupervised domain adaptation",
"description": "Transferring knowledge from a noisy source domain to unlabeled target domain. (Source: paperswithcode.com)"
}
]
},
{
"name": "Visual speech recognition",
"has_input": [
"Video"
],
"children": [
{
"name": "Lip to speech synthesis",
"description": "Given a silent video of a speaker, generate the corresponding speech that matches the lip movements. (Source: paperswithcode.com)",
"children": [
{
"name": "Speaker-specific lip to speech synthesis",
"description": "How accurately can we infer an individual\u2019s speech style and content from his/her lip movements? [1]\n\nIn this task, the model is trained on a specific speaker, or a very limited set of speakers. \n\n[1] Learning Individual Speaking Styles for Accurate Lip to Speech Synthesis, CVPR 2020. (Source: paperswithcode.com)"
}
]
}
]
},
{
"name": "Visual dialog",
"description": "Visual Dialog requires an AI agent to hold a meaningful dialog with humans in natural, conversational language about visual content. Specifically, given an image, a dialog history, and a follow-up question about the image, the task is to answer the question. (Source: paperswithcode.com)"
},
{
"name": "Person search",
"description": "**Person Search** is a task which aims at matching a specific person among a great number of whole scene images. (Source: paperswithcode.com)"
},
{
"name": "Pose estimation",
"description": "Pose Estimation is a general problem in Computer Vision where we detect the position and orientation of an object. (Source: paperswithcode.com)",
"children": [
{
"name": "2D human pose estimation",
"children": [
{
"name": "Deblurring",
"children": [
{
"name": "Blind image deblurring",
"description": "**Blind Image Deblurring** is a classical problem in image processing and computer vision, which aims to recover a latent image from a blurred input. (Source: paperswithcode.com)"
},
{
"name": "Single-image blind deblurring"
}
]
}
]
},
{
"name": "3D human pose estimation",
"children": [
{
"name": "Pose prediction",
"description": "Pose prediction is to predict future poses given a window of previous poses. (Source: paperswithcode.com)"
},
{
"name": "3D absolute human pose estimation",
"description": "This task aims to solve absolute (camera-centric not root-relative) 3D human pose estimation. (Source: paperswithcode.com)",
"children": [
{
"name": "3D face animation",
"description": "Image: [Cudeiro et al](https://arxiv.org/pdf/1905.03079v1.pdf) (Source: paperswithcode.com)",
"children": [
{
"name": "Image generation",
"description": "Image generation (synthesis) is the task of generating new images from an existing dataset.\n\n- **Unconditional generation** refers to generating samples unconditionally from the dataset, i.e. $p(y)$\n- **Conditional image generation** (subtask) refers to generating samples conditionally from the dataset, based on a label, i.e. $p(y|x)$.\n\nIn this section, you can find state-of-the-art leaderboards for **unconditional generation**. For conditional generation, and other types of image generations, refer to the subtasks. (Source: paperswithcode.com)",
"children": [
{
"name": "Face generation",
"description": "Face generation is the task of generating (or interpolating) new faces from an existing dataset.\n\nThe state-of-the-art results for this task are located in the Image Generation parent. (Source: paperswithcode.com)",
"children": [
{
"name": "Talking face generation",
"description": "Talking face generation aims to synthesize a sequence of face images that correspond to given speech semantics. (Source: paperswithcode.com)"
},
{
"name": "Talking head generation",
"description": "Talking head generation is the task of generating a talking face from a set of images of a person. (Source: paperswithcode.com)",
"children": [
{
"name": "Unconstrained lip-synchronization",
"description": "Given a video of an arbitrary person, and an arbitrary driving speech, the task is to generate a lip-synced video that matches the given speech. \n\nThis task requires the approach to not be constrained by identity, voice, or language. (Source: paperswithcode.com)"
}
]
}
]
},
{
"name": "Image inpainting",
"description": "**Image Inpainting** is a task of reconstructing missing regions in an image. It is an important problem in computer vision and an essential functionality in many imaging and graphics applications, e.g. object removal, image restoration, manipulation, re-targeting, compositing, and image-based rendering. (Source: paperswithcode.com)",
"children": [
{
"name": "Facial inpainting",
"description": "Facial inpainting (or face completion) is the task of generating plausible facial structures for missing pixels in a face image. (Source: paperswithcode.com)"
},
{
"name": "Image outpainting",
"description": "Predicting the visual context of an image beyond its boundary. (Source: paperswithcode.com)"
},
{
"name": "Cloud removal"
}
]
},
{
"name": "Image-to-image translation",
"description": "Image-to-image translation is the task of taking images from one domain and transforming them so they have the style (or characteristics) of images from another domain. (Source: paperswithcode.com)",
"children": [
{
"name": "Unsupervised image-to-image translation",
"description": "Unsupervised image-to-image translation is the task of doing image-to-image translation without ground truth image-to-image pairings. (Source: paperswithcode.com)"
},
{
"name": "Synthetic-to-real translation",
"description": "Synthetic-to-real translation is the task of domain adaptation from synthetic (or virtual) data to real data. (Source: paperswithcode.com)"
},
{
"name": "Multimodal unsupervised image-to-image translation",
"description": "Multimodal unsupervised image-to-image translation is the task of producing multiple translations to one domain from a single image in another domain. (Source: paperswithcode.com)"
},
{
"name": "Cross-view image-to-image translation"
},
{
"name": "Facial makeup transfer",
"description": "Facial makeup transfer aims to translate the **makeup style** from a given *reference* makeup face image to another non-makeup one while *preserving face identity*. (Source: paperswithcode.com)"
},
{
"name": "Fundus to angiography generation",
"description": "Generating Retinal Fluorescein Angiography from Retinal Fundus Image using Generative Adversarial Networks. (Source: paperswithcode.com)"
}
]
},
{
"name": "Conditional image generation",
"description": "Conditional image generation is the task of generating new images from a dataset conditional on their class. (Source: paperswithcode.com)"
},
{
"name": "Text-to-image generation",
"has_input": [
"Text"
],
"has_output": [
"Image"
],
"children": [
{
"name": "Zero-shot text-to-image generation"
}
]
},
{
"name": "Pose transfer"