1
2
3
4
5
6
7
8
9#define pr_fmt(fmt) "habanalabs: " fmt
10
11#include "habanalabs.h"
12
13#include <linux/pci.h>
14#include <linux/module.h>
15
/* Author string reported through MODULE_AUTHOR() below */
#define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"

/* One-line description reported through MODULE_DESCRIPTION() below */
#define HL_DRIVER_DESC		"Driver for HabanaLabs's AI Accelerators"

/* Module metadata */
MODULE_AUTHOR(HL_DRIVER_AUTHOR);
MODULE_DESCRIPTION(HL_DRIVER_DESC);
MODULE_LICENSE("GPL v2");
23
/* Major number of the char device region; assigned in hl_init() */
static int hl_major;
/* Device class created in hl_init() and handed to hl_device_init() */
static struct class *hl_class;
/*
 * Maps minor numbers to their struct hl_device. create_hdev() registers two
 * consecutive IDs (main and control) that both point to the same device.
 */
static DEFINE_IDR(hl_devs_idr);
/* Serializes every access to hl_devs_idr in this file */
static DEFINE_MUTEX(hl_devs_idr_lock);

/* Module parameter defaults; copied into each device by create_hdev() */
static int timeout_locked = 5;
static int reset_on_lockup = 1;

/* Mode 0444: the parameters are exposed read-only */
module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
	"Device lockup timeout in seconds (0 = disabled, default 5s)");

module_param(reset_on_lockup, int, 0444);
MODULE_PARM_DESC(reset_on_lockup,
	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
39
/* PCI vendor ID matched by this driver */
#define PCI_VENDOR_ID_HABANALABS	0x1da3

/* PCI device IDs; get_asic_type() translates them to enum hl_asic_type */
#define PCI_IDS_GOYA			0x0001
#define PCI_IDS_GAUDI			0x1000

/* Devices this driver binds to; the all-zero entry terminates the table */
static const struct pci_device_id ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GOYA), },
	{ PCI_DEVICE(PCI_VENDOR_ID_HABANALABS, PCI_IDS_GAUDI), },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, ids);
51
52
53
54
55
56
57
58
59
60static enum hl_asic_type get_asic_type(u16 device)
61{
62 enum hl_asic_type asic_type;
63
64 switch (device) {
65 case PCI_IDS_GOYA:
66 asic_type = ASIC_GOYA;
67 break;
68 case PCI_IDS_GAUDI:
69 asic_type = ASIC_GAUDI;
70 break;
71 default:
72 asic_type = ASIC_INVALID;
73 break;
74 }
75
76 return asic_type;
77}
78
79
80
81
82
83
84
85
86
87int hl_device_open(struct inode *inode, struct file *filp)
88{
89 struct hl_device *hdev;
90 struct hl_fpriv *hpriv;
91 int rc;
92
93 mutex_lock(&hl_devs_idr_lock);
94 hdev = idr_find(&hl_devs_idr, iminor(inode));
95 mutex_unlock(&hl_devs_idr_lock);
96
97 if (!hdev) {
98 pr_err("Couldn't find device %d:%d\n",
99 imajor(inode), iminor(inode));
100 return -ENXIO;
101 }
102
103 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
104 if (!hpriv)
105 return -ENOMEM;
106
107 hpriv->hdev = hdev;
108 filp->private_data = hpriv;
109 hpriv->filp = filp;
110 mutex_init(&hpriv->restore_phase_mutex);
111 kref_init(&hpriv->refcount);
112 nonseekable_open(inode, filp);
113
114 hl_cb_mgr_init(&hpriv->cb_mgr);
115 hl_ctx_mgr_init(&hpriv->ctx_mgr);
116
117 hpriv->taskpid = find_get_pid(current->pid);
118
119 mutex_lock(&hdev->fpriv_list_lock);
120
121 if (hl_device_disabled_or_in_reset(hdev)) {
122 dev_err_ratelimited(hdev->dev,
123 "Can't open %s because it is disabled or in reset\n",
124 dev_name(hdev->dev));
125 rc = -EPERM;
126 goto out_err;
127 }
128
129 if (hdev->in_debug) {
130 dev_err_ratelimited(hdev->dev,
131 "Can't open %s because it is being debugged by another user\n",
132 dev_name(hdev->dev));
133 rc = -EPERM;
134 goto out_err;
135 }
136
137 if (hdev->compute_ctx) {
138 dev_dbg_ratelimited(hdev->dev,
139 "Can't open %s because another user is working on it\n",
140 dev_name(hdev->dev));
141 rc = -EBUSY;
142 goto out_err;
143 }
144
145 rc = hl_ctx_create(hdev, hpriv);
146 if (rc) {
147 dev_err(hdev->dev, "Failed to create context %d\n", rc);
148 goto out_err;
149 }
150
151
152
153
154
155
156 hl_device_set_frequency(hdev, PLL_HIGH);
157
158 list_add(&hpriv->dev_node, &hdev->fpriv_list);
159 mutex_unlock(&hdev->fpriv_list_lock);
160
161 hl_debugfs_add_file(hpriv);
162
163 return 0;
164
165out_err:
166 mutex_unlock(&hdev->fpriv_list_lock);
167
168 hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
169 hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
170 filp->private_data = NULL;
171 mutex_destroy(&hpriv->restore_phase_mutex);
172 put_pid(hpriv->taskpid);
173
174 kfree(hpriv);
175
176 return rc;
177}
178
179int hl_device_open_ctrl(struct inode *inode, struct file *filp)
180{
181 struct hl_device *hdev;
182 struct hl_fpriv *hpriv;
183 int rc;
184
185 mutex_lock(&hl_devs_idr_lock);
186 hdev = idr_find(&hl_devs_idr, iminor(inode));
187 mutex_unlock(&hl_devs_idr_lock);
188
189 if (!hdev) {
190 pr_err("Couldn't find device %d:%d\n",
191 imajor(inode), iminor(inode));
192 return -ENXIO;
193 }
194
195 hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL);
196 if (!hpriv)
197 return -ENOMEM;
198
199 mutex_lock(&hdev->fpriv_list_lock);
200
201 if (hl_device_disabled_or_in_reset(hdev)) {
202 dev_err_ratelimited(hdev->dev_ctrl,
203 "Can't open %s because it is disabled or in reset\n",
204 dev_name(hdev->dev_ctrl));
205 rc = -EPERM;
206 goto out_err;
207 }
208
209 list_add(&hpriv->dev_node, &hdev->fpriv_list);
210 mutex_unlock(&hdev->fpriv_list_lock);
211
212 hpriv->hdev = hdev;
213 filp->private_data = hpriv;
214 hpriv->filp = filp;
215 hpriv->is_control = true;
216 nonseekable_open(inode, filp);
217
218 hpriv->taskpid = find_get_pid(current->pid);
219
220 return 0;
221
222out_err:
223 mutex_unlock(&hdev->fpriv_list_lock);
224 kfree(hpriv);
225 return rc;
226}
227
228static void set_driver_behavior_per_device(struct hl_device *hdev)
229{
230 hdev->mmu_enable = 1;
231 hdev->cpu_enable = 1;
232 hdev->fw_loading = 1;
233 hdev->cpu_queues_enable = 1;
234 hdev->heartbeat = 1;
235 hdev->clock_gating_mask = ULONG_MAX;
236
237 hdev->reset_pcilink = 0;
238 hdev->axi_drain = 0;
239 hdev->sram_scrambler_enable = 1;
240 hdev->dram_scrambler_enable = 1;
241 hdev->bmc_enable = 1;
242 hdev->hard_reset_on_fw_events = 1;
243}
244
245
246
247
248
249
250
251
252
253
254
255
256
257int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
258 enum hl_asic_type asic_type, int minor)
259{
260 struct hl_device *hdev;
261 int rc, main_id, ctrl_id = 0;
262
263 *dev = NULL;
264
265 hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
266 if (!hdev)
267 return -ENOMEM;
268
269
270
271
272 if (pdev) {
273 hdev->asic_type = get_asic_type(pdev->device);
274 if (hdev->asic_type == ASIC_INVALID) {
275 dev_err(&pdev->dev, "Unsupported ASIC\n");
276 rc = -ENODEV;
277 goto free_hdev;
278 }
279 } else {
280 hdev->asic_type = asic_type;
281 }
282
283 hdev->major = hl_major;
284 hdev->reset_on_lockup = reset_on_lockup;
285 hdev->pldm = 0;
286
287 set_driver_behavior_per_device(hdev);
288
289 if (timeout_locked)
290 hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
291 else
292 hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
293
294 hdev->disabled = true;
295 hdev->pdev = pdev;
296
297
298 hdev->dma_mask = 32;
299
300 mutex_lock(&hl_devs_idr_lock);
301
302
303
304
305 main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
306 GFP_KERNEL);
307
308 if (main_id >= 0)
309 ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
310 main_id + 2, GFP_KERNEL);
311
312 mutex_unlock(&hl_devs_idr_lock);
313
314 if ((main_id < 0) || (ctrl_id < 0)) {
315 if ((main_id == -ENOSPC) || (ctrl_id == -ENOSPC))
316 pr_err("too many devices in the system\n");
317
318 if (main_id >= 0) {
319 mutex_lock(&hl_devs_idr_lock);
320 idr_remove(&hl_devs_idr, main_id);
321 mutex_unlock(&hl_devs_idr_lock);
322 }
323
324 rc = -EBUSY;
325 goto free_hdev;
326 }
327
328 hdev->id = main_id;
329 hdev->id_control = ctrl_id;
330
331 *dev = hdev;
332
333 return 0;
334
335free_hdev:
336 kfree(hdev);
337 return rc;
338}
339
340
341
342
343
344
345
346void destroy_hdev(struct hl_device *hdev)
347{
348
349 mutex_lock(&hl_devs_idr_lock);
350 idr_remove(&hl_devs_idr, hdev->id);
351 idr_remove(&hl_devs_idr, hdev->id_control);
352 mutex_unlock(&hl_devs_idr_lock);
353
354 kfree(hdev);
355}
356
/*
 * hl_pmops_suspend - suspend callback of hl_pm_ops
 *
 * @dev: generic device whose driver data holds the struct hl_device
 *
 * Returns the result of hl_device_suspend(), or 0 (after logging an error)
 * when no driver data is attached to @dev.
 */
static int hl_pmops_suspend(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to suspend PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_suspend(hdev);

	pr_err("device pointer is NULL in suspend\n");
	return 0;
}
370
/*
 * hl_pmops_resume - resume callback of hl_pm_ops
 *
 * @dev: generic device whose driver data holds the struct hl_device
 *
 * Returns the result of hl_device_resume(), or 0 (after logging an error)
 * when no driver data is attached to @dev.
 */
static int hl_pmops_resume(struct device *dev)
{
	struct hl_device *hdev;

	pr_debug("Going to resume PCI device\n");

	hdev = dev_get_drvdata(dev);
	if (hdev)
		return hl_device_resume(hdev);

	pr_err("device pointer is NULL in resume\n");
	return 0;
}
384
385
386
387
388
389
390
391
392
393
394
395static int hl_pci_probe(struct pci_dev *pdev,
396 const struct pci_device_id *id)
397{
398 struct hl_device *hdev;
399 int rc;
400
401 dev_info(&pdev->dev, HL_NAME
402 " device found [%04x:%04x] (rev %x)\n",
403 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
404
405 rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
406 if (rc)
407 return rc;
408
409 pci_set_drvdata(pdev, hdev);
410
411 rc = hl_device_init(hdev, hl_class);
412 if (rc) {
413 dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
414 rc = -ENODEV;
415 goto disable_device;
416 }
417
418 return 0;
419
420disable_device:
421 pci_set_drvdata(pdev, NULL);
422 destroy_hdev(hdev);
423
424 return rc;
425}
426
427
428
429
430
431
432
433
434static void hl_pci_remove(struct pci_dev *pdev)
435{
436 struct hl_device *hdev;
437
438 hdev = pci_get_drvdata(pdev);
439 if (!hdev)
440 return;
441
442 hl_device_fini(hdev);
443 pci_set_drvdata(pdev, NULL);
444
445 destroy_hdev(hdev);
446}
447
/* Power management callbacks, attached to the PCI driver via .driver.pm */
static const struct dev_pm_ops hl_pm_ops = {
	.suspend = hl_pmops_suspend,
	.resume = hl_pmops_resume,
};
452
453static struct pci_driver hl_pci_driver = {
454 .name = HL_NAME,
455 .id_table = ids,
456 .probe = hl_pci_probe,
457 .remove = hl_pci_remove,
458 .driver.pm = &hl_pm_ops,
459};
460
461
462
463
464static int __init hl_init(void)
465{
466 int rc;
467 dev_t dev;
468
469 pr_info("loading driver\n");
470
471 rc = alloc_chrdev_region(&dev, 0, HL_MAX_MINORS, HL_NAME);
472 if (rc < 0) {
473 pr_err("unable to get major\n");
474 return rc;
475 }
476
477 hl_major = MAJOR(dev);
478
479 hl_class = class_create(THIS_MODULE, HL_NAME);
480 if (IS_ERR(hl_class)) {
481 pr_err("failed to allocate class\n");
482 rc = PTR_ERR(hl_class);
483 goto remove_major;
484 }
485
486 hl_debugfs_init();
487
488 rc = pci_register_driver(&hl_pci_driver);
489 if (rc) {
490 pr_err("failed to register pci device\n");
491 goto remove_debugfs;
492 }
493
494 pr_debug("driver loaded\n");
495
496 return 0;
497
498remove_debugfs:
499 hl_debugfs_fini();
500 class_destroy(hl_class);
501remove_major:
502 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
503 return rc;
504}
505
506
507
508
509static void __exit hl_exit(void)
510{
511 pci_unregister_driver(&hl_pci_driver);
512
513
514
515
516
517
518 hl_debugfs_fini();
519
520 class_destroy(hl_class);
521 unregister_chrdev_region(MKDEV(hl_major, 0), HL_MAX_MINORS);
522
523 idr_destroy(&hl_devs_idr);
524
525 pr_debug("driver removed\n");
526}
527
528module_init(hl_init);
529module_exit(hl_exit);
530