martinch59609
/
MasterarbeitLatex


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742
							% Encoding: UTF-8
% This file was created with Citavi 6.10.0.73

@article{EncoderDecoder7803544,
  author={Badrinarayanan, Vijay and Kendall, Alex and Cipolla, Roberto},
  journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, 
  title={SegNet: A Deep Convolutional Encoder-Decoder Architecture for Image Segmentation}, 
  year={2017},
  volume={39},
  number={12},
  pages={2481-2495},
  doi={10.1109/TPAMI.2016.2644615}
}

@misc{Ranftl.3242021,
 author = {Ranftl, Ren{\'e} and Bochkovskiy, Alexey and Koltun, Vladlen},
 year = {2021},
 title = {Vision Transformers for Dense Prediction},
 archivePrefix = {arXiv},
 eprint = {arXiv:2103.13413v1},
 url = {https://arxiv.org/pdf/2103.13413},
 keywords = {Computer Science - Computer Vision and Pattern Recognition;Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {We introduce dense vision transformers, an architecture that leverages vision transformers in place of convolutional networks as a backbone for dense prediction tasks. We assemble tokens from various stages of the vision transformer into image-like representations at various resolutions and progressively combine them into full-resolution predictions using a convolutional decoder. The transformer backbone processes representations at a constant and relatively high resolution and has a global receptive field at every stage. These properties allow the dense vision transformer to provide finer-grained and more globally coherent predictions when compared to fully-convolutional networks. Our experiments show that this architecture yields substantial improvements on dense prediction tasks, especially when a large amount of training data is available. For monocular depth estimation, we observe an improvement of up to 28{\%} in relative performance when compared to a state-of-the-art fully-convolutional network. When applied to semantic segmentation, dense vision transformers set a new state of the art on ADE20K with 49.02{\%} mIoU. We further show that the architecture can be fine-tuned on smaller datasets such as NYUv2, KITTI, and Pascal Context where it also sets the new state of the art. Our models are available at https://github.com/intel-isl/DPT.},
 pagetotal = {15},
 doi = {pages},
 note = {15 pages}
}

@online{NVIDIA.11262021,
 author = {NVIDIA},
 year = {2021},
 title = {NVIDIA Jetson Nano},
 url = {https://www.nvidia.com/de-de/autonomous-machines/embedded-systems/jetson-nano/product-development/},
 keywords = {AIoT;CUDA-X;Edge AI;Internet der Dinge;Jetson Nano;Jetson Nano Developer Kit;KI in der Peripherie;NVIDIA},
 urldate = {2021-12-12},
 abstract = {Das NVIDIA{\circledR} Jetson Nano{\texttrademark}-Entwicklerkit ist ein kleiner, leistungsstarker Computer, der trotz geringem Strombedarf moderne KI-Workloads stemmen kann. Statten Sie Millionen von Ger{\"a}ten in der Peripherie mit unglaublichen neuen Funktionen aus.}
}

@article{Sandler,
 author = {Sandler, Mark and Howard, Andrew and Zhu, Menglong and Zhmoginov, Andrey and Chen, Liang-Chieh},
 year = {2018},
 title = {MobileNetV2: Inverted Residuals and Linear Bottlenecks},
 archivePrefix = {arXiv},
 eprint = {arXiv:1801.04381v4},
 url = {https://arxiv.org/pdf/1801.04381},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 journaltitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
 abstract = {In this paper we describe a new mobile architecture, MobileNetV2, that improves the state of the art performance of mobile models on multiple tasks and benchmarks as well as across a spectrum of different model sizes. We also describe efficient ways of applying these mobile models to object detection in a novel framework we call SSDLite. Additionally, we demonstrate how to build mobile semantic segmentation models through a reduced form of DeepLabv3 which we call Mobile DeepLabv3.

The MobileNetV2 architecture is based on an inverted residual structure where the input and output of the residual block are thin bottleneck layers opposite to traditional residual models which use expanded representations in the input an MobileNetV2 uses lightweight depthwise convolutions to filter features in the intermediate expansion layer. Additionally, we find that it is important to remove non-linearities in the narrow layers in order to maintain representational power. We demonstrate that this improves performance and provide an intuition that led to this design. Finally, our approach allows decoupling of the input/output domains from the expressiveness of the transformation, which provides a convenient framework for further analysis. We measure our performance on Imagenet classification, COCO object detection, VOC image segmentation. We evaluate the trade-offs between accuracy, and number of operations measured by multiply-adds (MAdd), as well as the number of parameters},
 file = {MobileNetV2 Inverted Residuals and Linear Bottlenecks:Attachments/MobileNetV2 Inverted Residuals and Linear Bottlenecks.pdf:application/pdf}
}

@online{FlatBuffer.852021,
 year = {2021},
 title = {FlatBuffers: FlatBuffers white paper},
 url = {https://google.github.io/flatbuffers/flatbuffers_white_paper.html},
 urldate = {2021-11-15},
 abstract = {}
}

@online{Colab.1072021,
 year = {2021},
 title = {Google Colaboratory},
 url = {https://colab.research.google.com},
 urldate = {2021-10-07},
 abstract = {}
}

@online{coco.8252021,
 year = {2021},
 title = {COCO - Common Objects in Context},
 url = {https://cocodataset.org},
 urldate = {2021-11-14},
 abstract = {}
}

@online{cocoEval.8252021,
 year = {2021},
 title = {COCO - Common Objects in Context},
 url = {https://cocodataset.org/#detection-eval},
 urldate = {2021-11-14},
 abstract = {}
}

@misc{Aleotti.2020,
 author = {Aleotti, Filippo and Zaccaroni, Giulio and Bartolomei, Luca and Poggi, Matteo and Tosi, Fabio and Mattoccia, Stefano},
 year = {2020},
 title = {Real-time single image depth perception in the wild with handheld devices},
 archivePrefix = {arXiv},
 eprint = {arXiv:2006.05724v1},
 url = {https://arxiv.org/pdf/2006.05724},
 keywords = {Computer Vision and Pattern Recognition (cs.CV);Graphics (cs.GR)},
 abstract = {Depth perception is paramount to tackle real-world problems, ranging from autonomous driving to consumer applications. For the latter, depth estimation from a single image represents the most versatile solution, since a standard camera is available on almost any handheld device. Nonetheless, two main issues limit its practical deployment: i) the low reliability when deployed in-the-wild and ii) the demanding resource requirements to achieve real-time performance, often not compatible with such devices. Therefore, in this paper, we deeply investigate these issues showing how they are both addressable adopting appropriate network design and training strategies -- also outlining how to map the resulting networks on handheld devices to achieve real-time performance. Our thorough evaluation highlights the ability of such fast networks to generalize well to new environments, a crucial feature required to tackle the extremely varied contexts faced in real applications. Indeed, to further support this evidence, we report experimental results concerning real-time depth-aware augmented reality and image blurring with smartphones in-the-wild.},
 file = {Real-time single image depth perception in the wild with:Attachments/Real-time single image depth perception in the wild with.pdf:application/pdf}
}

@article{Arulprakash.2021,
 author = {Arulprakash, Enoch and Aruldoss, Martin},
 year = {2021},
 title = {A study on generic object detection with emphasis on future research directions},
 issn = {13191578},
 journaltitle = {Journal of King Saud University - Computer and Information Sciences},
 doi = {10.1016/j.jksuci.2021.08.001},
 abstract = {},
 note = {PII:  S1319157821002020}
}

@article{Everingham.2010,
 author = {Everingham, Mark and {van Gool}, Luc and Williams, Christopher K. I. and Winn, John and Zisserman, Andrew},
 year = {2010},
 title = {The Pascal Visual Object Classes (VOC) Challenge},
 pages = {303--338},
 pagination = {page},
 volume = {88},
 issn = {0920-5691},
 journaltitle = {International Journal of Computer Vision},
 shortjournal = {Int J Comput Vis},
 doi = {10.1007/s11263-009-0275-4},
 number = {2},
 abstract = {},
 note = {PII:  275}
}

@article{Everingham.2015,
 author = {Everingham, Mark and Eslami, S. M. Ali and {van Gool}, Luc and Williams, Christopher K. I. and Winn, John and Zisserman, Andrew},
 year = {2015},
 title = {The Pascal Visual Object Classes Challenge: A Retrospective},
 pages = {98--136},
 pagination = {page},
 volume = {111},
 issn = {0920-5691},
 journaltitle = {International Journal of Computer Vision},
 shortjournal = {Int J Comput Vis},
 doi = {10.1007/s11263-014-0733-5},
 number = {1},
 abstract = {},
 note = {PII:  733}
}

@misc{EveringhamDevelopmentKit.2012,
  title = {The PASCAL Visual Object Classes Challenge 2012 (VOC2012) Development Kit},
  author = {Everingham, Mark and Winn, John},
  year = {2012},
  url = {http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html},
  urldate = {2021-11-24},
}

@article{Hosang.2016,
   title={What Makes for Effective Detection Proposals?},
   volume={38},
   ISSN={2160-9292},
   url={http://dx.doi.org/10.1109/TPAMI.2015.2465908},
   DOI={10.1109/tpami.2015.2465908},
   number={4},
   journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
   publisher={Institute of Electrical and Electronics Engineers (IEEE)},
   author={Hosang, Jan and Benenson, Rodrigo and Dollar, Piotr and Schiele, Bernt},
   year={2016},
   month={4},
   pages={814–830}
}

@misc{Girshick.2013,
 author = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
 year = {2013},
 title = {Rich feature hierarchies for accurate object detection and semantic segmentation},
 archivePrefix = {arXiv},
 eprint = {arXiv:1311.2524v5},
 url = {https://arxiv.org/pdf/1311.2524},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {Object detection performance, as measured on the canonical PASCAL VOC dataset, has plateaued in the last few years. The best-performing methods are complex ensemble systems that typically combine multiple low-level image features with high-level context. In this paper, we propose a simple and scalable detection algorithm that improves mean average precision (mAP) by more than 30{\%} relative to the previous best result on VOC 2012---achieving a mAP of 53.3{\%}. Our approach combines two key insights: (1) one can apply high-capacity convolutional neural networks (CNNs) to bottom-up region proposals in order to localize and segment objects and (2) when labeled training data is scarce, supervised pre-training for an auxiliary task, followed by domain-specific fine-tuning, yields a significant performance boost. Since we combine region proposals with CNNs, we call our method R-CNN: Regions with CNN features. We also compare R-CNN to OverFeat, a recently proposed sliding-window detector based on a similar CNN architecture. We find that R-CNN outperforms OverFeat by a large margin on the 200-class ILSVRC2013 detection dataset. Source code for the complete system is available at this http URL.},
 file = {Rich feature hierarchies for accurate object detection a:Attachments/Rich feature hierarchies for accurate object detection a.pdf:application/pdf}
}

@online{.332021,
 year = {2021},
 title = {MNIST handwritten digit database, Yann LeCun, Corinna Cortes and Chris Burges},
 url = {http://yann.lecun.com/exdb/mnist/},
 urldate = {2021-10-20},
 abstract = {}
}

@online{Tensorflow.8202021,
 author = {TensorFlow},
 year = {2021},
 title = {TensorFlow 2 Schnellstart f{\"u}r Anf{\"a}nger ~|~ TensorFlow Core},
 url = {https://www.tensorflow.org/tutorials/quickstart/beginner?hl=de},
 urldate = {2021-10-20},
 abstract = {}
}


@online{GitHub.1062021,
 author = {GitHub},
 year = {2021},
 title = {models/research/object{\_}detection at master · tensorflow/models},
 url = {https://github.com/tensorflow/models/tree/master/research/object_detection},
 urldate = {2021-10-06},
 abstract = {Models and examples built with TensorFlow. Contribute to tensorflow/models development by creating an account on GitHub.}
}


@online{GitHub.1062021b,
 author = {GitHub},
 year = {2021},
 title = {GitHub - isl-org/MiDaS: Code for robust monocular depth estimation described in Ranftl et. al., Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer, TPAMI 2020},
 url = {https://github.com/isl-org/MiDaS},
 urldate = {2021-10-06},
 abstract = {Code for robust monocular depth estimation described in Ranftl et. al., Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer, TPAMI 2020 - GitHub - isl-org/MiDaS: Code for robust monocular depth estimation described in Ranftl et. al., Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer, TPAMI 2020}
}


@online{GitHub.1062021c,
 author = {GitHub},
 year = {2021},
 title = {GitHub - FilippoAleotti/mobilePydnet: Pydnet on mobile devices},
 url = {https://github.com/FilippoAleotti/mobilePydnet},
 urldate = {2021-10-06},
 abstract = {Pydnet on mobile devices. Contribute to FilippoAleotti/mobilePydnet development by creating an account on GitHub.}
}

@online{GitHubDetectionZoo.11152021,
 author = {GitHub},
 year = {2021},
 title = {models/tf2{\_}detection{\_}zoo.md at master · tensorflow/models},
 url = {https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2_detection_zoo.md},
 urldate = {2021-11-15},
 abstract = {Models and examples built with TensorFlow. Contribute to tensorflow/models development by creating an account on GitHub.}
}


@online{GitHubTensorFlowAndroidExample.11222021,
 author = {GitHub},
 year = {2021},
 title = {examples/lite/examples/image{\_}classification/android at master · tensorflow/examples},
 url = {https://github.com/tensorflow/examples/tree/master/lite/examples/object_detection/android},
 urldate = {2021-11-22},
 abstract = {TensorFlow examples. Contribute to tensorflow/examples development by creating an account on GitHub.}
}


@online{GitHub.1062021d,
 author = {GitHub},
 year = {2021},
 title = {GitHub - tzutalin/labelImg: LabelImg is a graphical image annotation tool and label object bounding boxes in images},
 url = {https://github.com/tzutalin/labelImg},
 urldate = {2021-10-06},
 abstract = {LabelImg is a graphical image annotation tool and label object bounding boxes in images - GitHub - tzutalin/labelImg: LabelImg is a graphical image annotation tool and label object bounding boxes in images}
}

@online{GitHub.1282021,
 author = {GitHub},
 year = {2021},
 title = {GitHub - Genymobile/scrcpy: Display and control your Android device},
 url = {https://github.com/Genymobile/scrcpy},
 urldate = {2021-12-08},
 abstract = {Display and control your Android device. Contribute to Genymobile/scrcpy development by creating an account on GitHub.}
}

@online{GitHub.1282021b,
 author = {GitHub},
 year = {2021},
 title = {GitHub - lutzroeder/netron: Visualizer for neural network, deep learning, and machine learning models},
 url = {https://github.com/lutzroeder/netron},
 urldate = {2021-12-08},
 abstract = {Visualizer for neural network, deep learning, and machine learning models - GitHub - lutzroeder/netron: Visualizer for neural network, deep learning, and machine learning models}
}

@online{GitHub.12122021,
 author = {GitHub},
 year = {2021},
 title = {models/research/object{\_}detection at master · tensorflow/models},
 url = {https://github.com/tensorflow/models/blob/master/research/object_detection/protos/preprocessor.proto},
 urldate = {2021-12-12},
 abstract = {Models and examples built with TensorFlow. Contribute to tensorflow/models development by creating an account on GitHub.}
}

@book{Goodfellow.2018,
 author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
 year = {2018},
 title = {Deep Learning},
 url = {https://ebookcentral.proquest.com/lib/kxp/detail.action?docID=5598176},
 keywords = {Electronic books;Machine learning},
 edition = {1st edition},
 publisher = {mitp},
 isbn = {978-3-95845-700-3},
 subtitle = {Das umfassende Handbuch : Grundlagen, aktuelle Verfahren und Algorithmen, neue Forschungsans{\"a}tze},
 language = {ger},
 location = {Frechen},
 series = {mitp Professional},
 abstract = {Intro -- Impressum -- Website zum Buch -- Danksagung -- {\"U}ber die Fachkorrektoren zur deutschen Ausgabe -- Notation -- Einleitung -- F{\"u}r wen ist dieses Buch gedacht? -- Historische Entwicklungen im Deep Learning -- I Angewandte Mathematik und Grundlagen f{\"u}r das Machine Learning -- Lineare Algebra -- Skalare, Vektoren, Matrizen und Tensoren -- Multiplizieren von Matrizen und Vektoren -- Einheits- und Umkehrmatrizen -- Lineare Abh{\"a}ngigkeit und lineare H{\"u}lle -- Normen -- Spezielle Matrizen und Vektoren -- Eigenwertzerlegung -- Singul{\"a}rwertzerlegung -- Die Moore-Penrose-Pseudoinverse -- Der Spuroperator -- Die Determinante -- Beispiel: Hauptkomponentenanalyse -- Wahrscheinlichkeits- und Informationstheorie -- Warum Wahrscheinlichkeit? -- Zufallsvariablen -- Wahrscheinlichkeitsverteilungen -- Randwahrscheinlichkeit -- Bedingte Wahrscheinlichkeit -- Die Produktregel der bedingten Wahrscheinlichkeiten -- Unabh{\"a}ngigkeit und bedingte Unabh{\"a}ngigkeit -- Erwartungswert, Varianz und Kovarianz -- H{\"a}ufig genutzte Wahrscheinlichkeitsverteilungen -- N{\"u}tzliche Eigenschaften h{\"a}ufig verwendeter Funktionen -- Satz von Bayes -- Technische Einzelheiten stetiger Variablen -- Informationstheorie -- Strukturierte probabilistische Modelle -- Numerische Berechnung -- {\"U}berlauf und Unterlauf -- Schlechte Konditionierung -- Optimierung auf Gradientenbasis -- Optimierung unter Nebenbedingungen -- Beispiel: Lineare kleinste Quadrate -- Grundlagen f{\"u}r das Machine Learning -- Lernalgorithmen -- Kapazit{\"a}t, {\"U}beranpassung und Unteranpassung -- Hyperparameter und Validierungsdaten -- Sch{\"a}tzer, Verzerrung und Varianz -- Maximum-Likelihood-Sch{\"a}tzung -- Bayessche Statistik -- Algorithmen f{\"u}r {\"u}berwachtes Lernen -- Algorithmen f{\"u}r un{\"u}berwachtes Lernen -- Stochastisches Gradientenabstiegsverfahren -- Entwickeln eines Machine-Learning-Algorithmus -- Probleme, an denen Deep Learning w{\"a}chst.},
 pagetotal = {883},
 note = {Goodfellow, Ian (VerfasserIn)

Bengio, Yoshua (VerfasserIn)

Courville, Aaron (VerfasserIn)

Lenz, Guido ({\"U}bersetzerIn)}
}

@book{Grus.2020,
 author = {Grus, Joel},
 year = {2020},
 title = {Einf{\"u}hrung in Data Science},
 url = {https://search.ebscohost.com/login.aspx?direct=true&scope=site&db=nlebk&db=nlabk&AN=2298669},
 edition = {2. Auflage},
 publisher = {O'Reilly and dpunkt.verlag},
 isbn = {978-3-96009-123-3},
 subtitle = {Grundprinzipien der Datenanalyse mit Python},
 language = {ger},
 location = {Heidelberg},
 abstract = {},
 note = {Grus, Joel (VerfasserIn)

Rother, Kristian ({\"U}bersetzerIn)

Demmig, Thomas ({\"U}bersetzerIn)}
}

@misc{He.2015,
 author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
 year = {2015},
 title = {Deep Residual Learning for Image Recognition},
 archivePrefix = {arXiv},
 eprint = {arXiv:1512.03385v1},
 url = {https://arxiv.org/pdf/1512.03385},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. An ensemble of these residual nets achieves 3.57{\%} error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers.

The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28{\%} relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to ILSVRC {\&} COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.},
 file = {Deep Residual Learning for Image Recognition:Attachments/Deep Residual Learning for Image Recognition.pdf:application/pdf}
}

@misc{HenriqueF.DeArruda.2019,
 author = {{Henrique F. De Arruda} and {Alexandre Benatti} and {C{\'e}sar Henrique Comin} and {Luciano Da F. Costa}},
 year = {2019},
 title = {Learning Deep Learning (CDT-15)},
 publisher = {Unpublished},
 abstract = {},
 doi = {10.13140/RG.2.2.29866.57283},
 language = {en}
}

@misc{Hosang.2017,
 author = {Hosang, Jan and Benenson, Rodrigo and Schiele, Bernt},
 year = {2017},
 title = {Learning non-maximum suppression},
 archivePrefix = {arXiv},
 eprint = {arXiv:1705.02950v2},
 url = {https://arxiv.org/pdf/1705.02950},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {Object detectors have hugely profited from moving towards an end-to-end learning paradigm: proposals, features, and the classifier becoming one neural network improved results two-fold on general object detection. One indispensable component is non-maximum suppression (NMS), a post-processing algorithm responsible for merging all detections that belong to the same object. The de facto standard NMS algorithm is still fully hand-crafted, suspiciously simple, and -- being based on greedy clustering with a fixed distance threshold -- forces a trade-off between recall and precision. We propose a new network architecture designed to perform NMS, using only boxes and their score. We report experiments for person detection on PETS and for general object categories on the COCO dataset. Our approach shows promise providing improved localization and occlusion handling.},
 file = {Learning non-maximum suppression:Attachments/Learning non-maximum suppression.pdf:application/pdf}
}

@misc{Jacob.12152017,
 author = {Jacob, Benoit and Kligys, Skirmantas and Chen, Bo and Zhu, Menglong and Tang, Matthew and Howard, Andrew and Adam, Hartwig and Kalenichenko, Dmitry},
 year = {2017},
 title = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference},
 archivePrefix = {arXiv},
 eprint = {arXiv:1712.05877v1},
 url = {https://arxiv.org/pdf/1712.05877},
 keywords = {Computer Science - Learning;Machine Learning (cs.LG);Machine Learning (stat.ML);Statistics - Machine Learning},
 abstract = {The rising popularity of intelligent mobile devices and the daunting computational cost of deep learning-based models call for efficient and accurate on-device inference schemes. We propose a quantization scheme that allows inference to be carried out using integer-only arithmetic, which can be implemented more efficiently than floating point inference on commonly available integer-only hardware. We also co-design a training procedure to preserve end-to-end model accuracy post quantization. As a result, the proposed quantization scheme improves the tradeoff between accuracy and on-device latency. The improvements are significant even on MobileNets, a model family known for run-time efficiency, and are demonstrated in ImageNet classification and COCO detection on popular CPUs.},
 pagetotal = {14},
 note = {14 pages, 12 figures}
}

@misc{Howard.2017,
 author = {Howard, Andrew G. and Zhu, Menglong and Chen, Bo and Kalenichenko, Dmitry and Wang, Weijun and Weyand, Tobias and Andreetto, Marco and Adam, Hartwig},
 year = {2017},
 title = {MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications},
 archivePrefix = {arXiv},
 eprint = {arXiv:1704.04861v1},
 url = {https://arxiv.org/pdf/1704.04861},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {We present a class of efficient models called MobileNets for mobile and embedded vision applications. MobileNets are based on a streamlined architecture that uses depth-wise separable convolutions to build light weight deep neural networks. We introduce two simple global hyper-parameters that efficiently trade off between latency and accuracy. These hyper-parameters allow the model builder to choose the right sized model for their application based on the constraints of the problem. We present extensive experiments on resource and accuracy tradeoffs and show strong performance compared to other popular models on ImageNet classification. We then demonstrate the effectiveness of MobileNets across a wide range of applications and use cases including object detection, finegrain classification, face attributes and large scale geo-localization.},
 file = {MobileNets Efficient Convolutional Neural Networks for M:Attachments/MobileNets Efficient Convolutional Neural Networks for M.pdf:application/pdf}
}

@misc{Ioffe.2015,
 author = {Ioffe, Sergey and Szegedy, Christian},
 year = {2015},
 title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift},
 archivePrefix = {arXiv},
 eprint = {arXiv:1502.03167v3},
 url = {https://arxiv.org/pdf/1502.03167},
 keywords = {Machine Learning (cs.LG)},
 abstract = {Training Deep Neural Networks is complicated by the fact that the distribution of each layer's inputs changes during training, as the parameters of the previous layers change. This slows down the training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. We refer to this phenomenon as internal covariate shift, and address the problem by normalizing layer inputs. Our method draws its strength from making normalization a part of the model architecture and performing the normalization for each training mini-batch. Batch Normalization allows us to use much higher learning rates and be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout. Applied to a state-of-the-art image classification model, Batch Normalization achieves the same accuracy with 14 times fewer training steps, and beats the original model by a significant margin. Using an ensemble of batch-normalized networks, we improve upon the best published result on ImageNet classification: reaching 4.9{\%} top-5 validation error (and 4.8{\%} test error), exceeding the accuracy of human raters.}
}

@misc{Krishnamoorthi.2018,
 author = {Krishnamoorthi, Raghuraman},
 year = {2018},
 title = {Quantizing deep convolutional networks for efficient inference: A whitepaper},
 archivePrefix = {arXiv},
 eprint = {arXiv:1806.08342v1},
 url = {https://arxiv.org/pdf/1806.08342},
 keywords = {Computer Vision and Pattern Recognition (cs.CV);Machine Learning (cs.LG);Machine Learning (stat.ML)},
 abstract = {We present an overview of techniques for quantizing convolutional neural networks for inference with integer weights and activations. Per-channel quantization of weights and per-layer quantization of activations to 8-bits of precision post-training produces classification accuracies within 2{\%} of floating point networks for a wide variety of CNN architectures. Model sizes can be reduced by a factor of 4 by quantizing weights to 8-bits, even when 8-bit arithmetic is not supported. This can be achieved with simple, post training quantization of weights.We benchmark latencies of quantized networks on CPUs and DSPs and observe a speedup of 2x-3x for quantized implementations compared to floating point on CPUs. Speedups of up to 10x are observed on specialized processors with fixed point SIMD capabilities, like the Qualcomm QDSPs with HVX.

Quantization-aware training can provide further improvements, reducing the gap to floating point to 1{\%} at 8-bit precision. Quantization-aware training also allows for reducing the precision of weights to four bits with accuracy losses ranging from 2{\%} to 10{\%}, with higher accuracy drop for smaller networks.We introduce tools in TensorFlow and TensorFlowLite for quantizing convolutional networks and review best practices for quantization-aware training to obtain high accuracy with quantized weights and activations. We recommend that per-channel quantization of weights and per-layer quantization of activations be the preferred quantization scheme for hardware acceleration and kernel optimization. We also propose that future processors and hardware accelerators for optimized inference support precisions of 4, 8 and 16 bits.},
 file = {Quantizing deep convolutional networks for efficient inf:Attachments/Quantizing deep convolutional networks for efficient inf.pdf:application/pdf}
}

@misc{Lin.1292016,
 author = {Lin, Tsung-Yi and Doll{\'a}r, Piotr and Girshick, Ross and He, Kaiming and Hariharan, Bharath and Belongie, Serge},
 year = {2016},
 title = {Feature Pyramid Networks for Object Detection},
 archivePrefix = {arXiv},
 eprint = {arXiv:1612.03144v2},
 url = {https://arxiv.org/pdf/1612.03144},
 keywords = {Computer Science - Computer Vision and Pattern Recognition;Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {Feature pyramids are a basic component in recognition systems for detecting objects at different scales. But recent deep learning object detectors have avoided pyramid representations, in part because they are compute and memory intensive. In this paper, we exploit the inherent multi-scale, pyramidal hierarchy of deep convolutional networks to construct feature pyramids with marginal extra cost. A top-down architecture with lateral connections is developed for building high-level semantic feature maps at all scales. This architecture, called a Feature Pyramid Network (FPN), shows significant improvement as a generic feature extractor in several applications. Using FPN in a basic Faster R-CNN system, our method achieves state-of-the-art single-model results on the COCO detection benchmark without bells and whistles, surpassing all existing single-model entries including those from the COCO 2016 challenge winners. In addition, our method can run at 5 FPS on a GPU and thus is a practical and accurate solution to multi-scale object detection. Code will be made publicly available.},
 file = {Feature Pyramid Networks for Object Detection:Attachments/Feature Pyramid Networks for Object Detection.pdf:application/pdf}
}

@misc{Liu.2015,
 author = {Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C.},
 year = {2015},
 title = {SSD: Single Shot MultiBox Detector},
 archivePrefix = {arXiv},
 eprint = {arXiv:1512.02325v5},
 url = {https://arxiv.org/pdf/1512.02325},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {We present a method for detecting objects in images using a single deep neural network. Our approach, named SSD, discretizes the output space of bounding boxes into a set of default boxes over different aspect ratios and scales per feature map location. At prediction time, the network generates scores for the presence of each object category in each default box and produces adjustments to the box to better match the object shape. Additionally, the network combines predictions from multiple feature maps with different resolutions to naturally handle objects of various sizes. Our SSD model is simple relative to methods that require object proposals because it completely eliminates proposal generation and subsequent pixel or feature resampling stage and encapsulates all computation in a single network. This makes SSD easy to train and straightforward to integrate into systems that require a detection component. Experimental results on the PASCAL VOC, MS COCO, and ILSVRC datasets confirm that SSD has comparable accuracy to methods that utilize an additional object proposal step and is much faster, while providing a unified framework for both training and inference. Compared to other single stage methods, SSD has much better accuracy, even with a smaller input image size. For 

300$\times$300

input, SSD achieves 72.1{\%} mAP on VOC2007 test at 58 FPS on a Nvidia Titan X and for

500$\times$500

input, SSD achieves 75.1{\%} mAP, outperforming a comparable state of the art Faster R-CNN model. Code is available at this https URL .},
 file = {SSD Single Shot MultiBox Detector:Attachments/SSD Single Shot MultiBox Detector.pdf:application/pdf}
}

@misc{Poggi.2018,
 author = {Poggi, Matteo and Aleotti, Filippo and Tosi, Fabio and Mattoccia, Stefano},
 year = {2018},
 title = {Towards real-time unsupervised monocular depth estimation on CPU},
 archivePrefix = {arXiv},
 eprint = {arXiv:1806.11430v3},
 url = {https://arxiv.org/pdf/1806.11430},
 keywords = {Computer Vision and Pattern Recognition (cs.CV);Robotics (cs.RO)},
 abstract = {Unsupervised depth estimation from a single image is a very attractive technique with several implications in robotic, autonomous navigation, augmented reality and so on. This topic represents a very challenging task and the advent of deep learning enabled to tackle this problem with excellent results. However, these architectures are extremely deep and complex. Thus, real-time performance can be achieved only by leveraging power-hungry GPUs that do not allow to infer depth maps in application fields characterized by low-power constraints. To tackle this issue, in this paper we propose a novel architecture capable to quickly infer an accurate depth map on a CPU, even of an embedded system, using a pyramid of features extracted from a single input image. Similarly to state-of-the-art, we train our network in an unsupervised manner casting depth estimation as an image reconstruction problem. Extensive experimental results on the KITTI dataset show that compared to the top performing approach our network has similar accuracy but a much lower complexity (about 6{\%} of parameters) enabling to infer a depth map for a KITTI image in about 1.7 s on the Raspberry Pi 3 and at more than 8 Hz on a standard CPU. Moreover, by trading accuracy for efficiency, our network allows to infer maps at about 2 Hz and 40 Hz respectively, still being more accurate than most state-of-the-art slower methods. To the best of our knowledge, it is the first method enabling such performance on CPUs paving the way for effective deployment of unsupervised monocular depth estimation even on embedded systems.},
 file = {Towards real-time unsupervised monocular depth estimatio:Attachments/Towards real-time unsupervised monocular depth estimatio.pdf:application/pdf}
}

@misc{Ranftl.2019,
 author = {Ranftl, Ren{\'e} and Lasinger, Katrin and Hafner, David and Schindler, Konrad and Koltun, Vladlen},
 year = {2019},
 title = {Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer},
 archivePrefix = {arXiv},
 eprint = {arXiv:1907.01341v3},
 url = {https://arxiv.org/pdf/1907.01341},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {The success of monocular depth estimation relies on large and diverse training sets. Due to the challenges associated with acquiring dense ground-truth depth across different environments at scale, a number of datasets with distinct characteristics and biases have emerged. We develop tools that enable mixing multiple datasets during training, even if their annotations are incompatible. In particular, we propose a robust training objective that is invariant to changes in depth range and scale, advocate the use of principled multi-objective learning to combine data from different sources, and highlight the importance of pretraining encoders on auxiliary tasks. Armed with these tools, we experiment with five diverse training datasets, including a new, massive data source: 3D films. To demonstrate the generalization power of our approach we use zero-shot cross-dataset transfer, i.e. we evaluate on datasets that were not seen during training. The experiments confirm that mixing data from complementary sources greatly improves monocular depth estimation. Our approach clearly outperforms competing methods across diverse datasets, setting a new state of the art for monocular depth estimation. Some results are shown in the supplementary video at this https URL},
 file = {Towards Robust Monocular Depth Estimation Mixing Dataset:Attachments/Towards Robust Monocular Depth Estimation Mixing Dataset.pdf:application/pdf}
}

@book{Raschka.2018,
 author = {Raschka, Sebastian},
 year = {2018},
 title = {Machine Learning mit Python und Scikit-learn und TensorFlow},
 url = {http://www.content-select.com/index.php?id=bib_view&ean=9783958457348},
 edition = {2., aktualisierte und erweiterte Auflage},
 publisher = {mitp},
 isbn = {978-3-95845-733-1},
 subtitle = {Das umfassende Praxis-Handbuch f{\"u}r Data Science, Deep Learning und Predictive Analytics},
 language = {ger},
 location = {Frechen},
 series = {mitp Professional},
 abstract = {},
 pagetotal = {577}
}

@misc{Redmon.2015,
 author = {Redmon, Joseph and Divvala, Santosh and Girshick, Ross and Farhadi, Ali},
 year = {2015},
 title = {You Only Look Once: Unified, Real-Time Object Detection},
 archivePrefix = {arXiv},
 eprint = {arXiv:1506.02640v5},
 url = {https://arxiv.org/pdf/1506.02640},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {We present YOLO, a new approach to object detection. Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance.

Our unified architecture is extremely fast. Our base YOLO model processes images in real-time at 45 frames per second. A smaller version of the network, Fast YOLO, processes an astounding 155 frames per second while still achieving double the mAP of other real-time detectors. Compared to state-of-the-art detection systems, YOLO makes more localization errors but is far less likely to predict false detections where nothing exists. Finally, YOLO learns very general representations of objects. It outperforms all other detection methods, including DPM and R-CNN, by a wide margin when generalizing from natural images to artwork on both the Picasso Dataset and the People-Art Dataset.},
 file = {You Only Look Once Unified Real-Time Object Detection:Attachments/You Only Look Once Unified Real-Time Object Detection.pdf:application/pdf}
}


@misc{Ren.2015,
 author = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun, Jian},
 year = {2015},
 title = {Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks},
 archivePrefix = {arXiv},
 eprint = {arXiv:1506.01497v3},
 url = {https://arxiv.org/pdf/1506.01497},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. Code has been made publicly available.},
 file = {Faster R-CNN Towards Real-Time Object Detection with Reg:Attachments/Faster R-CNN Towards Real-Time Object Detection with Reg.pdf:application/pdf}
}

@misc{Rezatofighi.2252019,
 author = {Rezatofighi, Hamid and Tsoi, Nathan and Gwak, JunYoung and Sadeghian, Amir and Reid, Ian and Savarese, Silvio},
 year = {2019},
 title = {Generalized Intersection over Union: A Metric and A Loss for Bounding Box Regression},
 archivePrefix = {arXiv},
 eprint = {arXiv:1902.09630v2},
 url = {https://arxiv.org/pdf/1902.09630},
 keywords = {Artificial Intelligence (cs.AI);Computer Science - Artificial Intelligence;Computer Science - Computer Vision and Pattern Recognition;Computer Science - Learning;Computer Vision and Pattern Recognition (cs.CV);Machine Learning (cs.LG)},
 abstract = {Intersection over Union (IoU) is the most popular evaluation metric used in the object detection benchmarks. However, there is a gap between optimizing the commonly used distance losses for regressing the parameters of a bounding box and maximizing this metric value. The optimal objective for a metric is the metric itself. In the case of axis-aligned 2D bounding boxes, it can be shown that $IoU$ can be directly used as a regression loss. However, $IoU$ has a plateau making it infeasible to optimize in the case of non-overlapping bounding boxes. In this paper, we address the weaknesses of $IoU$ by introducing a generalized version as both a new loss and a new metric. By incorporating this generalized $IoU$ ($GIoU$) as a loss into the state-of-the art object detection frameworks, we show a consistent improvement on their performance using both the standard, $IoU$ based, and new, $GIoU$ based, performance measures on popular object detection benchmarks such as PASCAL VOC and MS COCO.},
 doi = {CVPR},
 file = {Generalized Intersection over Union A Metric and A Loss:Attachments/Generalized Intersection over Union A Metric and A Loss.pdf:application/pdf},
 note = {accepted in CVPR 2019}
}

@online{TensorFlow.8102021,
 author = {TensorFlow},
 year = {2021},
 title = {TensorFlow Lite-Konverter},
 url = {https://www.tensorflow.org/lite/convert},
 urldate = {2021-10-06},
 abstract = {}
}


@online{TensorFlow.8122021,
 author = {TensorFlow},
 year = {2021},
 title = {Quantisierung nach dem Training ~|~ TensorFlow Lite},
 url = {https://www.tensorflow.org/lite/performance/post_training_quantization},
 urldate = {2021-10-06},
 abstract = {}
}


@online{TensorFlow.8202021,
 author = {TensorFlow},
 year = {2021},
 title = {TensorFlow Lite | ML f{\"u}r Mobilger{\"a}te und Edge-Ger{\"a}te},
 url = {https://www.tensorflow.org/lite/?hl=de},
 urldate = {2021-10-06},
 abstract = {Ein Deep-Learning-Framework f{\"u}r ger{\"a}teinterne Inferenz. Trainieren und implementieren Sie Machine-Learning-Modelle auf mobilen und IoT-Ger{\"a}ten, Android, iOS, Edge TPU, Raspberry Pi.}
}


@online{TensorFlow.832021,
 author = {TensorFlow},
 year = {2021},
 title = {TensorFlow},
 url = {https://www.tensorflow.org/},
 urldate = {2021-10-06},
 abstract = {Eine End-to-End-Open-Source-Machine-Learning-Plattform f{\"u}r alle. Entdecken Sie das flexible {\"O}kosystem von TensorFlow aus Tools, Bibliotheken und Community-Ressourcen.}
}

@InProceedings{Xian_2018_CVPR,
author = {Xian, Ke and Shen, Chunhua and Cao, Zhiguo and Lu, Hao and Xiao, Yang and Li, Ruibo and Luo, Zhenbo},
title = {Monocular Relative Depth Perception With Web Stereo Data Supervision},
booktitle = {Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {6},
year = {2018}
}

@misc{Zhou.182017,
 author = {Zhou, Yiren and Song, Sibo and Cheung, Ngai-Man},
 year = {2017},
 title = {On Classification of Distorted Images with Deep Convolutional Neural Networks},
 archivePrefix = {arXiv},
 eprint = {arXiv:1701.01924v1},
 url = {http://arxiv.org/pdf/1701.01924v1},
 abstract = {Image blur and image noise are common distortions during image acquisition. In this paper, we systematically study the effect of image distortions on the deep neural network (DNN) image classifiers. First, we examine the DNN classifier performance under four types of distortions. Second, we propose two approaches to alleviate the effect of image distortion: re-training and fine-tuning with noisy images. Our results suggest that, under certain conditions, fine-tuning with noisy images can alleviate much effect due to distorted inputs, and is more practical than re-training.},
 pagetotal = {5},
 file = {On_Classification_of_Distorted_Images_with_Deep_Co:Attachments/On_Classification_of_Distorted_Images_with_Deep_Co.pdf:application/pdf},
 note = {5 pages, 8 figures, ICASSP 2017}
}

@book{Zhou.2021,
 author = {Zhou, Zhi-Hua},
 year = {2021},
 title = {Machine Learning},
 publisher = {{Springer Singapore}},
 isbn = {978-981-15-1966-6},
 location = {Singapore},
 abstract = {},
 doi = {10.1007/978-981-15-1967-3},
 file = {Machine Learning:Attachments/Machine Learning.pdf:application/pdf}
}

@InProceedings{PireStereoVision.2012,
author = {Pire, Taihú and De Cristóforis, Pablo and Nitsche, Matias and Berlles, Julio},
year = {2012},
month = {12},
pages = {},
title = {Stereo vision obstacle avoidance using depth and elevation maps}
}

@InProceedings{Zhang.2019,
  author={Zhang, Zhenghong and Xiong, Mingkang and Xiong, Huilin},
  booktitle={2019 4th International Conference on Cloud Computing and Internet of Things (CCIOT)}, 
  title={Monocular Depth Estimation for UAV Obstacle Avoidance}, 
  year={2019},
  volume={},
  number={},
  pages={43-47},
  doi={10.1109/CCIOT48581.2019.8980350}}

@InProceedings{Chakravarty.2017,
  author={Chakravarty, Punarjay and Kelchtermans, Klaas and Roussel, Tom and Wellens, Stijn and Tuytelaars, Tinne and Van Eycken, Luc},
  booktitle={2017 IEEE International Conference on Robotics and Automation (ICRA)}, 
  title={CNN-based single image obstacle avoidance on a quadrotor}, 
  year={2017},
  volume={},
  number={},
  pages={6369-6374},
  doi={10.1109/ICRA.2017.7989752}
}

@InProceedings{Macias-GarciaDetection.2020,
  author={Macias-Garcia, Edgar and Galeana-Perez, Deysy and Bayro-Corrochano, Eduardo},
  booktitle={2020 International Joint Conference on Neural Networks (IJCNN)}, 
  title={CNN Based Perception System for Collision Avoidance in Mobile Robots using Stereo Vision}, 
  year={2020},
  volume={},
  number={},
  pages={1-7},
  doi={10.1109/IJCNN48605.2020.9206747}
}

@InProceedings{Nalpantidis.2009,
  author = {Nalpantidis, Lazaros and Kostavelis, Ioannis and Gasteratos, Antonios},
  year = {2009},
  month = {12},
  pages = {195-204},
  title = {Stereovision-Based Algorithm for Obstacle Avoidance},
  isbn = {978-3-642-10816-7},
  doi = {10.1007/978-3-642-10817-4_19}
}

@inbook{Alvarez.2016,
  author = {Alvarez, H. and Paz, L.M. and Sturm, Jürgen and Cremers, D.},
  year = {2016},
  month = {11},
  pages = {195-209},
  title = {Collision Avoidance for Quadrotors with a Monocular Camera},
  volume = {109},
  isbn = {978-3-319-23777-0},
  doi = {10.1007/978-3-319-23778-7_14}
}

@misc{Hatch.2020,
 author = {Hatch, Kyle and Mern, John and Kochenderfer, Mykel},
 year = {2020},
 title = {Obstacle Avoidance Using a Monocular Camera},
 archivePrefix = {arXiv},
 eprint = {arXiv:2012.01608v2},
 url = {https://arxiv.org/pdf/2012.01608},
 keywords = {Artificial Intelligence (cs.AI);Robotics (cs.RO)},
 abstract = {A collision avoidance system based on simple digital cameras would help enable the safe integration of small UAVs into crowded, low-altitude environments. In this work, we present an obstacle avoidance system for small UAVs that uses a monocular camera with a hybrid neural network and path planner controller. The system is comprised of a vision network for estimating depth from camera images, a high-level control network, a collision prediction network, and a contingency policy. This system is evaluated on a simulated UAV navigating an obstacle course in a constrained flight pattern. Results show the proposed system achieves low collision rates while maintaining operationally relevant flight speeds.},
 file = {Obstacle Avoidance Using a Monocular Camera:Attachments/Obstacle Avoidance Using a Monocular Camera.pdf:application/pdf}
}

@misc{Jacob.12152017,
 author = {Jacob, Benoit and Kligys, Skirmantas and Chen, Bo and Zhu, Menglong and Tang, Matthew and Howard, Andrew and Adam, Hartwig and Kalenichenko, Dmitry},
 year = {2017},
 title = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference},
 archivePrefix = {arXiv},
 eprint = {arXiv:1712.05877v1},
 url = {https://arxiv.org/pdf/1712.05877},
 keywords = {Computer Science - Learning;Machine Learning (cs.LG);Machine Learning (stat.ML);Statistics - Machine Learning},
 abstract = {The rising popularity of intelligent mobile devices and the daunting computational cost of deep learning-based models call for efficient and accurate on-device inference schemes. We propose a quantization scheme that allows inference to be carried out using integer-only arithmetic, which can be implemented more efficiently than floating point inference on commonly available integer-only hardware. We also co-design a training procedure to preserve end-to-end model accuracy post quantization. As a result, the proposed quantization scheme improves the tradeoff between accuracy and on-device latency. The improvements are significant even on MobileNets, a model family known for run-time efficiency, and are demonstrated in ImageNet classification and COCO detection on popular CPUs.},
 pagetotal = {14},
 note = {14 pages, 12 figures}
}


@misc{Kim.11152015,
 author = {Kim, Dong Ki and Chen, Tsuhan},
 year = {2015},
 title = {Deep Neural Network for Real-Time Autonomous Indoor Navigation},
 archivePrefix = {arXiv},
 eprint = {arXiv:1511.04668v2},
 url = {https://arxiv.org/pdf/1511.04668},
 keywords = {Computer Science - Computer Vision and Pattern Recognition;Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {Autonomous indoor navigation of Micro Aerial Vehicles (MAVs) possesses many challenges. One main reason is that GPS has limited precision in indoor environments. The additional fact that MAVs are not able to carry heavy weight or power consuming sensors, such as range finders, makes indoor autonomous navigation a challenging task. In this paper, we propose a practical system in which a quadcopter autonomously navigates indoors and finds a specific target, i.e., a book bag, by using a single camera. A deep learning model, Convolutional Neural Network (ConvNet), is used to learn a controller strategy that mimics an expert pilot's choice of action. We show our system's performance through real-time experiments in diverse indoor locations. To understand more about our trained network, we use several visualization techniques.},
 file = {Deep Neural Network for Real-Time Autonomous Indoor Navi:Attachments/Deep Neural Network for Real-Time Autonomous Indoor Navi.pdf:application/pdf}
}

@article{Liu.2017,
 author = {Liu, Canglong and Zheng, Bin and Wang, Chunyang and Zhao, Yongting and Fu, Shun and Li, Haochen},
 year = {2017},
 title = {CNN-Based Vision Model for Obstacle Avoidance of Mobile Robot},
 pages = {00007},
 pagination = {page},
 volume = {139},
 journaltitle = {MATEC Web of Conferences},
 shortjournal = {MATEC Web Conf.},
 doi = {10.1051/matecconf/201713900007},
 abstract = {},
 file = {CNN-Based Vision Model for Obstacle Avoidance of Mobile Robot:Attachments/CNN-Based Vision Model for Obstacle Avoidance of Mobile Robot.pdf:application/pdf},
 note = {Xu, Bing (Editor) Chen, Yinong (Editor) PII:  matecconf{\_}icmite2017{\_}00007}
}

@misc{Yang.2017,
 author = {Yang, Shichao and Konam, Sandeep and Ma, Chen and Rosenthal, Stephanie and Veloso, Manuela and Scherer, Sebastian},
 year = {2017},
 title = {Obstacle Avoidance through Deep Networks based Intermediate Perception},
 archivePrefix = {arXiv},
 eprint = {arXiv:1704.08759v1},
 url = {https://arxiv.org/pdf/1704.08759},
 keywords = {Computer Vision and Pattern Recognition (cs.CV);Robotics (cs.RO)},
 abstract = {Obstacle avoidance from monocular images is a challenging problem for robots. Though multi-view structure-from-motion could build 3D maps, it is not robust in textureless environments. Some learning based methods exploit human demonstration to predict a steering command directly from a single image. However, this method is usually biased towards certain tasks or demonstration scenarios and also biased by human understanding. In this paper, we propose a new method to predict a trajectory from images. We train our system on more diverse NYUv2 dataset. The ground truth trajectory is computed from the designed cost functions automatically. The Convolutional Neural Network perception is divided into two stages: first, predict depth map and surface normal from RGB images, which are two important geometric properties related to 3D obstacle representation. Second, predict the trajectory from the depth and normal. Results show that our intermediate perception increases the accuracy by 20{\%} than the direct prediction. Our model generalizes well to other public indoor datasets and is also demonstrated for robot flights in simulation and experiments.},
 file = {Obstacle Avoidance through Deep Networks based Intermedi:Attachments/Obstacle Avoidance through Deep Networks based Intermedi.pdf:application/pdf}
}

@misc{Zhu.2016,
 author = {Zhu, Yuke and Mottaghi, Roozbeh and Kolve, Eric and Lim, Joseph J. and Gupta, Abhinav and Fei-Fei, Li and Farhadi, Ali},
 year = {2016},
 title = {Target-driven Visual Navigation in Indoor Scenes using Deep Reinforcement Learning},
 archivePrefix = {arXiv},
 eprint = {arXiv:1609.05143v1},
 url = {https://arxiv.org/pdf/1609.05143},
 keywords = {Computer Vision and Pattern Recognition (cs.CV)},
 abstract = {Two less addressed issues of deep reinforcement learning are (1) lack of generalization capability to new target goals, and (2) data inefficiency i.e., the model requires several (and often costly) episodes of trial and error to converge, which makes it impractical to be applied to real-world scenarios. In this paper, we address these two issues and apply our model to the task of target-driven visual navigation. To address the first issue, we propose an actor-critic model whose policy is a function of the goal as well as the current state, which allows to better generalize. To address the second issue, we propose AI2-THOR framework, which provides an environment with high-quality 3D scenes and physics engine. Our framework enables agents to take actions and interact with objects. Hence, we can collect a huge number of training samples efficiently.

We show that our proposed method (1) converges faster than the state-of-the-art deep reinforcement learning methods, (2) generalizes across targets and across scenes, (3) generalizes to a real robot scenario with a small amount of fine-tuning (although the model is trained in simulation), (4) is end-to-end trainable and does not need feature engineering, feature matching between frames or 3D reconstruction of the environment.

The supplementary video can be accessed at the following link: this https URL.}
}

@online{OpenImages.6252021,
 year = {2021},
 title = {Open Images V6},
 url = {https://storage.googleapis.com/openimages/web/index.html},
 urldate = {2021-12-12},
 abstract = {}
}

@Comment{jabref-meta: databaseType:biblatex;}