1
2
3
4
5
6
7
8#define pr_fmt(fmt) "habanalabs: " fmt
9
10#include "habanalabs.h"
11
12#include <linux/pci.h>
13#include <linux/sched/signal.h>
14#include <linux/hwmon.h>
15#include <uapi/misc/habanalabs.h>
16
17#define HL_PLDM_PENDING_RESET_PER_SEC (HL_PENDING_RESET_PER_SEC * 10)
18
19bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
20{
21 if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
22 return true;
23 else
24 return false;
25}
26
27enum hl_device_status hl_device_status(struct hl_device *hdev)
28{
29 enum hl_device_status status;
30
31 if (hdev->disabled)
32 status = HL_DEVICE_STATUS_MALFUNCTION;
33 else if (atomic_read(&hdev->in_reset))
34 status = HL_DEVICE_STATUS_IN_RESET;
35 else
36 status = HL_DEVICE_STATUS_OPERATIONAL;
37
38 return status;
39}
40
/*
 * hpriv_release - kref release callback for a per-process private structure
 *
 * @ref: embedded refcount inside the hl_fpriv that dropped to zero
 *
 * Tears down the per-process bookkeeping: drops the pid reference, removes
 * the debugfs entry, unlinks the process from the device's fpriv list and
 * frees the structure. Runs when the last reference (file + contexts) is put.
 */
static void hpriv_release(struct kref *ref)
{
	struct hl_fpriv *hpriv;
	struct hl_device *hdev;

	hpriv = container_of(ref, struct hl_fpriv, refcount);

	hdev = hpriv->hdev;

	put_pid(hpriv->taskpid);

	hl_debugfs_remove_file(hpriv);

	mutex_destroy(&hpriv->restore_phase_mutex);

	/* Unlink from the device list and clear the active compute context
	 * pointer under the list lock, so reset/teardown paths that walk
	 * fpriv_list never see a half-released entry.
	 */
	mutex_lock(&hdev->fpriv_list_lock);
	list_del(&hpriv->dev_node);
	hdev->compute_ctx = NULL;
	mutex_unlock(&hdev->fpriv_list_lock);

	kfree(hpriv);
}
63
/* Take an additional reference on a per-process private structure */
void hl_hpriv_get(struct hl_fpriv *hpriv)
{
	kref_get(&hpriv->refcount);
}
68
/* Drop a reference on a per-process private structure; frees it via
 * hpriv_release() when the count reaches zero.
 */
void hl_hpriv_put(struct hl_fpriv *hpriv)
{
	kref_put(&hpriv->refcount, hpriv_release);
}
73
74
75
76
77
78
79
80
81
/*
 * hl_device_release - release function for the compute char device
 *
 * @inode: pointer to inode structure
 * @filp: pointer to file structure
 *
 * Called when a process closes the habanalabs compute device. Destroys the
 * process's command-buffer and context managers, then drops the fpriv
 * reference taken at open time (the structure itself is freed only when all
 * references are gone).
 */
static int hl_device_release(struct inode *inode, struct file *filp)
{
	struct hl_fpriv *hpriv = filp->private_data;

	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);

	filp->private_data = NULL;

	hl_hpriv_put(hpriv);

	return 0;
}
95
/*
 * hl_device_release_ctrl - release function for the control char device
 *
 * @inode: pointer to inode structure
 * @filp: pointer to file structure
 *
 * Control-device opens carry no contexts or command buffers, so the private
 * structure is simply unlinked from the device list and freed directly
 * (no kref dance as in hl_device_release()).
 */
static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
{
	struct hl_fpriv *hpriv = filp->private_data;
	struct hl_device *hdev;

	filp->private_data = NULL;

	hdev = hpriv->hdev;

	mutex_lock(&hdev->fpriv_list_lock);
	list_del(&hpriv->dev_node);
	mutex_unlock(&hdev->fpriv_list_lock);

	kfree(hpriv);

	return 0;
}
113
114
115
116
117
118
119
120
121
122
123static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
124{
125 struct hl_fpriv *hpriv = filp->private_data;
126
127 if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
128 vma->vm_pgoff ^= HL_MMAP_CB_MASK;
129 return hl_cb_mmap(hpriv, vma);
130 }
131
132 return -EINVAL;
133}
134
/* File operations of the compute (primary) char device */
static const struct file_operations hl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open,
	.release = hl_device_release,
	.mmap = hl_mmap,
	.unlocked_ioctl = hl_ioctl,
	.compat_ioctl = hl_ioctl
};
143
/* File operations of the control char device - no mmap, restricted ioctls */
static const struct file_operations hl_ctrl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open_ctrl,
	.release = hl_device_release_ctrl,
	.unlocked_ioctl = hl_ioctl_control,
	.compat_ioctl = hl_ioctl_control
};
151
/* Release callback of the struct device; frees the allocation made in
 * device_init_cdev() once the device refcount drops to zero.
 */
static void device_release_func(struct device *dev)
{
	kfree(dev);
}
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/*
 * device_init_cdev - initialize a cdev and its accompanying struct device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: class to attach the new device to
 * @minor: minor number to use for the device node
 * @fops: file operations to install on the cdev
 * @name: name of the device node as it appears in /dev
 * @cdev: cdev structure to initialize (embedded in hdev)
 * @dev: out parameter, receives the newly allocated struct device
 *
 * Initializes the cdev and allocates/initializes the matching struct device,
 * but does NOT add either to the system - that is done later by
 * device_cdev_sysfs_add(). On failure the caller owns nothing; on success
 * the caller must eventually free *dev (via put or kfree on the error path).
 *
 * Return: 0 on success, -ENOMEM if the device allocation fails.
 */
static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
				int minor, const struct file_operations *fops,
				char *name, struct cdev *cdev,
				struct device **dev)
{
	cdev_init(cdev, fops);
	cdev->owner = THIS_MODULE;

	*dev = kzalloc(sizeof(**dev), GFP_KERNEL);
	if (!*dev)
		return -ENOMEM;

	device_initialize(*dev);
	(*dev)->devt = MKDEV(hdev->major, minor);
	(*dev)->class = hclass;
	/* release callback frees the allocation above */
	(*dev)->release = device_release_func;
	dev_set_drvdata(*dev, hdev);
	dev_set_name(*dev, "%s", name);

	return 0;
}
191
/*
 * device_cdev_sysfs_add - expose the char devices and sysfs nodes
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Adds the compute and control char devices to the system and creates the
 * sysfs entries. Uses goto-based unwind so a failure at any step removes
 * everything added before it. Sets cdev_sysfs_created on success so the
 * teardown path knows what to remove.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int device_cdev_sysfs_add(struct hl_device *hdev)
{
	int rc;

	rc = cdev_device_add(&hdev->cdev, hdev->dev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a char device to the system\n");
		return rc;
	}

	rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a control char device to the system\n");
		goto delete_cdev_device;
	}

	/* hl_sysfs_init() must be done after adding the device to the system */
	rc = hl_sysfs_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize sysfs\n");
		goto delete_ctrl_cdev_device;
	}

	hdev->cdev_sysfs_created = true;

	return 0;

delete_ctrl_cdev_device:
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
delete_cdev_device:
	cdev_device_del(&hdev->cdev, hdev->dev);
	return rc;
}
227
/*
 * device_cdev_sysfs_del - remove the char devices and sysfs nodes
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Counterpart of device_cdev_sysfs_add(). If the devices were never added
 * (early init failure), only the struct device allocations from
 * device_init_cdev() need to be freed; otherwise remove sysfs and both
 * char devices (their release callbacks free the allocations).
 */
static void device_cdev_sysfs_del(struct hl_device *hdev)
{
	/* device_init_cdev() succeeded but device_cdev_sysfs_add() was never
	 * called (or failed) - just free the bare allocations.
	 */
	if (!hdev->cdev_sysfs_created) {
		kfree(hdev->dev_ctrl);
		kfree(hdev->dev);
		return;
	}

	hl_sysfs_fini(hdev);
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
	cdev_device_del(&hdev->cdev, hdev->dev);
}
241
242
243
244
245
246
247
248
249
250static int device_early_init(struct hl_device *hdev)
251{
252 int i, rc;
253 char workq_name[32];
254
255 switch (hdev->asic_type) {
256 case ASIC_GOYA:
257 goya_set_asic_funcs(hdev);
258 strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
259 break;
260 case ASIC_GAUDI:
261 gaudi_set_asic_funcs(hdev);
262 sprintf(hdev->asic_name, "GAUDI");
263 break;
264 default:
265 dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
266 hdev->asic_type);
267 return -EINVAL;
268 }
269
270 rc = hdev->asic_funcs->early_init(hdev);
271 if (rc)
272 return rc;
273
274 rc = hl_asid_init(hdev);
275 if (rc)
276 goto early_fini;
277
278 if (hdev->asic_prop.completion_queues_count) {
279 hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
280 sizeof(*hdev->cq_wq),
281 GFP_ATOMIC);
282 if (!hdev->cq_wq) {
283 rc = -ENOMEM;
284 goto asid_fini;
285 }
286 }
287
288 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
289 snprintf(workq_name, 32, "hl-free-jobs-%u", i);
290 hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
291 if (hdev->cq_wq[i] == NULL) {
292 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
293 rc = -ENOMEM;
294 goto free_cq_wq;
295 }
296 }
297
298 hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
299 if (hdev->eq_wq == NULL) {
300 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
301 rc = -ENOMEM;
302 goto free_cq_wq;
303 }
304
305 hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
306 GFP_KERNEL);
307 if (!hdev->hl_chip_info) {
308 rc = -ENOMEM;
309 goto free_eq_wq;
310 }
311
312 hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
313 sizeof(struct hl_device_idle_busy_ts),
314 (GFP_KERNEL | __GFP_ZERO));
315 if (!hdev->idle_busy_ts_arr) {
316 rc = -ENOMEM;
317 goto free_chip_info;
318 }
319
320 hl_cb_mgr_init(&hdev->kernel_cb_mgr);
321
322 mutex_init(&hdev->send_cpu_message_lock);
323 mutex_init(&hdev->debug_lock);
324 mutex_init(&hdev->mmu_cache_lock);
325 INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
326 spin_lock_init(&hdev->hw_queues_mirror_lock);
327 INIT_LIST_HEAD(&hdev->fpriv_list);
328 mutex_init(&hdev->fpriv_list_lock);
329 atomic_set(&hdev->in_reset, 0);
330
331 return 0;
332
333free_chip_info:
334 kfree(hdev->hl_chip_info);
335free_eq_wq:
336 destroy_workqueue(hdev->eq_wq);
337free_cq_wq:
338 for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
339 if (hdev->cq_wq[i])
340 destroy_workqueue(hdev->cq_wq[i]);
341 kfree(hdev->cq_wq);
342asid_fini:
343 hl_asid_fini(hdev);
344early_fini:
345 if (hdev->asic_funcs->early_fini)
346 hdev->asic_funcs->early_fini(hdev);
347
348 return rc;
349}
350
351
352
353
354
355
356
/*
 * device_early_fini - tear down everything set up by device_early_init()
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Destroys locks, the kernel CB manager, workqueues and S/W allocations in
 * reverse order of their creation, then calls the ASIC early_fini hook if
 * one exists.
 */
static void device_early_fini(struct hl_device *hdev)
{
	int i;

	mutex_destroy(&hdev->mmu_cache_lock);
	mutex_destroy(&hdev->debug_lock);
	mutex_destroy(&hdev->send_cpu_message_lock);

	mutex_destroy(&hdev->fpriv_list_lock);

	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);

	kfree(hdev->idle_busy_ts_arr);
	kfree(hdev->hl_chip_info);

	destroy_workqueue(hdev->eq_wq);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		destroy_workqueue(hdev->cq_wq[i]);
	kfree(hdev->cq_wq);

	hl_asid_fini(hdev);

	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);
}
383
/*
 * set_freq_to_low_job - periodic work that drops the PLL to low frequency
 *
 * @work: embedded delayed_work inside the hl_device
 *
 * If no compute context is currently open, lower the device frequency to
 * save power, then re-arm itself. The fpriv_list_lock protects the
 * compute_ctx check against concurrent open/close.
 */
static void set_freq_to_low_job(struct work_struct *work)
{
	struct hl_device *hdev = container_of(work, struct hl_device,
						work_freq.work);

	mutex_lock(&hdev->fpriv_list_lock);

	if (!hdev->compute_ctx)
		hl_device_set_frequency(hdev, PLL_LOW);

	mutex_unlock(&hdev->fpriv_list_lock);

	schedule_delayed_work(&hdev->work_freq,
			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
}
399
400static void hl_device_heartbeat(struct work_struct *work)
401{
402 struct hl_device *hdev = container_of(work, struct hl_device,
403 work_heartbeat.work);
404
405 if (hl_device_disabled_or_in_reset(hdev))
406 goto reschedule;
407
408 if (!hdev->asic_funcs->send_heartbeat(hdev))
409 goto reschedule;
410
411 dev_err(hdev->dev, "Device heartbeat failed!\n");
412 hl_device_reset(hdev, true, false);
413
414 return;
415
416reschedule:
417 schedule_delayed_work(&hdev->work_heartbeat,
418 usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
419}
420
421
422
423
424
425
426
427
428
/*
 * device_late_init - do late initialization for the device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Calls the ASIC late_init hook (if any), applies the initial PLL profile
 * according to the power-management mode, and arms the frequency and
 * (optionally) heartbeat delayed work items. Sets late_init_done so
 * device_late_fini() knows whether there is anything to undo.
 *
 * Return: 0 on success, negative errno from the ASIC late_init on failure.
 */
static int device_late_init(struct hl_device *hdev)
{
	int rc;

	if (hdev->asic_funcs->late_init) {
		rc = hdev->asic_funcs->late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"failed late initialization for the H/W\n");
			return rc;
		}
	}

	hdev->high_pll = hdev->asic_prop.high_pll;

	/* force setting to low frequency */
	hdev->curr_pll_profile = PLL_LOW;

	if (hdev->pm_mng_profile == PM_AUTO)
		hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
	else
		hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);

	INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
	schedule_delayed_work(&hdev->work_freq,
	usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));

	if (hdev->heartbeat) {
		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
		schedule_delayed_work(&hdev->work_heartbeat,
				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
	}

	hdev->late_init_done = true;

	return 0;
}
466
467
468
469
470
471
472
/*
 * device_late_fini - undo device_late_init()
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Cancels the periodic work items (synchronously, so none is in flight
 * afterwards) and calls the ASIC late_fini hook. No-op if late init never
 * completed.
 */
static void device_late_fini(struct hl_device *hdev)
{
	if (!hdev->late_init_done)
		return;

	cancel_delayed_work_sync(&hdev->work_freq);
	if (hdev->heartbeat)
		cancel_delayed_work_sync(&hdev->work_heartbeat);

	if (hdev->asic_funcs->late_fini)
		hdev->asic_funcs->late_fini(hdev);

	hdev->late_init_done = false;
}
487
/*
 * hl_device_utilization - compute device busy percentage over a period
 *
 * @hdev: pointer to habanalabs device structure
 * @period_ms: length of the look-back window, in milliseconds
 *
 * Walks the circular idle/busy timestamp array backwards from the current
 * index, summing busy intervals that overlap the last period_ms, and
 * returns the busy time as a percentage of the period (rounded up).
 *
 * NOTE(review): last_index is u32; decrementing it from 0 wraps around, and
 * the "> HL_IDLE_BUSY_TS_ARR_SIZE" checks rely on that wrap to detect the
 * underflow and jump to the last array slot.
 */
uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms)
{
	struct hl_device_idle_busy_ts *ts;
	ktime_t zero_ktime, curr = ktime_get();
	u32 overlap_cnt = 0, last_index = hdev->idle_busy_ts_idx;
	s64 period_us, last_start_us, last_end_us, last_busy_time_us,
		total_busy_time_us = 0, total_busy_time_ms;

	zero_ktime = ktime_set(0, 0);
	period_us = period_ms * USEC_PER_MSEC;
	ts = &hdev->idle_busy_ts_arr[last_index];

	/* If both timestamps are zero, the current slot was never used -
	 * the device is idle now, so start from the previous (last used) slot.
	 */
	if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime) &&
			!ktime_compare(ts->idle_to_busy_ts, zero_ktime)) {

		last_index--;

		/* Handle wrap-around of the circular array index */
		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;

		ts = &hdev->idle_busy_ts_arr[last_index];
	}

	while (overlap_cnt < HL_IDLE_BUSY_TS_ARR_SIZE) {

		/* Check if we are in a busy interval that started before the
		 * window we are measuring
		 */
		last_start_us = ktime_to_us(
				ktime_sub(curr, ts->idle_to_busy_ts));

		if (last_start_us > period_us) {

			/* Check if the device is currently busy (interval
			 * has a start but no end yet)
			 */
			if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime)) {

				/* Busy for the entire window */
				if (ktime_compare(ts->idle_to_busy_ts,
						zero_ktime))
					return 100;

				/* Never-used slot reached - we walked past
				 * all recorded history
				 */
				break;
			}

			/* Interval ended - check whether it ended inside
			 * the window
			 */
			last_end_us = ktime_to_us(
					ktime_sub(curr, ts->busy_to_idle_ts));

			if (last_end_us > period_us)
				break;

			/* Interval straddles the window start - count only
			 * the part inside the window
			 */
			last_busy_time_us = ktime_to_us(
					ktime_sub(ts->busy_to_idle_ts,
					ts->idle_to_busy_ts));
			total_busy_time_us += last_busy_time_us -
					(last_start_us - period_us);
			break;
		}

		/* Interval fully inside the window - add it all */
		if (ktime_compare(ts->busy_to_idle_ts, zero_ktime))
			last_busy_time_us = ktime_to_us(
					ktime_sub(ts->busy_to_idle_ts,
					ts->idle_to_busy_ts));
		else
			last_busy_time_us = ktime_to_us(
					ktime_sub(curr, ts->idle_to_busy_ts));

		total_busy_time_us += last_busy_time_us;

		last_index--;

		/* Handle wrap-around of the circular array index */
		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;

		ts = &hdev->idle_busy_ts_arr[last_index];

		overlap_cnt++;
	}

	total_busy_time_ms = DIV_ROUND_UP_ULL(total_busy_time_us,
						USEC_PER_MSEC);

	return DIV_ROUND_UP_ULL(total_busy_time_ms * 100, period_ms);
}
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
598{
599 if ((hdev->pm_mng_profile == PM_MANUAL) ||
600 (hdev->curr_pll_profile == freq))
601 return 0;
602
603 dev_dbg(hdev->dev, "Changing device frequency to %s\n",
604 freq == PLL_HIGH ? "high" : "low");
605
606 hdev->asic_funcs->set_pll_profile(hdev, freq);
607
608 hdev->curr_pll_profile = freq;
609
610 return 1;
611}
612
/*
 * hl_device_set_debug_mode - enable or disable device debug (profiling) mode
 *
 * @hdev: pointer to habanalabs device structure
 * @enable: true to enter debug mode, false to leave it
 *
 * Entering debug mode disables clock gating; leaving it halts the coresight
 * infrastructure and re-enables clock gating (both skipped while a hard
 * reset is pending, since the H/W is going away anyway). The whole
 * transition is serialized by debug_lock.
 *
 * Return: 0 on success, -EFAULT if the device is already in the requested
 *         mode.
 */
int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
{
	int rc = 0;

	mutex_lock(&hdev->debug_lock);

	if (!enable) {
		if (!hdev->in_debug) {
			dev_err(hdev->dev,
				"Failed to disable debug mode because device was not in debug mode\n");
			rc = -EFAULT;
			goto out;
		}

		if (!hdev->hard_reset_pending)
			hdev->asic_funcs->halt_coresight(hdev);

		hdev->in_debug = 0;

		if (!hdev->hard_reset_pending)
			hdev->asic_funcs->set_clock_gating(hdev);

		goto out;
	}

	if (hdev->in_debug) {
		dev_err(hdev->dev,
			"Failed to enable debug mode because device is already in debug mode\n");
		rc = -EFAULT;
		goto out;
	}

	hdev->asic_funcs->disable_clock_gating(hdev);
	hdev->in_debug = 1;

out:
	mutex_unlock(&hdev->debug_lock);

	return rc;
}
653
654
655
656
657
658
659
660
661
662
/*
 * hl_device_suspend - initiate device suspend
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Puts the H/W in the suspend state (all registers with clock gated). The
 * device can't be accessed until hl_device_resume() is called. Marks the
 * device as in-reset so no other flow can run concurrently, and drains the
 * queue and CPU-message paths before handing control to the ASIC suspend.
 *
 * Return: 0 on success, -EIO if a reset is already in progress.
 */
int hl_device_suspend(struct hl_device *hdev)
{
	int rc;

	pci_save_state(hdev->pdev);

	/* Block future CS/VM/JOB completion operations */
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	if (rc) {
		dev_err(hdev->dev, "Can't suspend while in reset\n");
		return -EIO;
	}

	/* This blocks all other stuff that is not blocked by in_reset */
	hdev->disabled = true;

	/*
	 * Flush anyone that is inside the critical section of enqueue
	 * jobs to the H/W
	 */
	hdev->asic_funcs->hw_queues_lock(hdev);
	hdev->asic_funcs->hw_queues_unlock(hdev);

	/* Flush anyone that is inside device open */
	mutex_lock(&hdev->send_cpu_message_lock);
	mutex_unlock(&hdev->send_cpu_message_lock);

	rc = hdev->asic_funcs->suspend(hdev);
	if (rc)
		dev_err(hdev->dev,
			"Failed to disable PCI access of device CPU\n");

	/* Shut down the device */
	pci_disable_device(hdev->pdev);
	pci_set_power_state(hdev->pdev, PCI_D3hot);

	return 0;
}
701
702
703
704
705
706
707
708
709
710
/*
 * hl_device_resume - initiate device resume
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Brings the PCI device back to D0, restores its state, and then performs a
 * hard reset to bring the H/W back to a known-good state after suspend.
 *
 * Return: 0 on success, negative errno on failure (PCI is disabled again on
 *         the error path).
 */
int hl_device_resume(struct hl_device *hdev)
{
	int rc;

	pci_set_power_state(hdev->pdev, PCI_D0);
	pci_restore_state(hdev->pdev);
	rc = pci_enable_device_mem(hdev->pdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to enable PCI device in resume\n");
		return rc;
	}

	pci_set_master(hdev->pdev);

	rc = hdev->asic_funcs->resume(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to resume device after suspend\n");
		goto disable_device;
	}

	/* Clear the suspend markers so hl_device_reset() is allowed to run */
	hdev->disabled = false;
	atomic_set(&hdev->in_reset, 0);

	rc = hl_device_reset(hdev, true, false);
	if (rc) {
		dev_err(hdev->dev, "Failed to reset device during resume\n");
		goto disable_device;
	}

	return 0;

disable_device:
	pci_clear_master(hdev->pdev);
	pci_disable_device(hdev->pdev);

	return rc;
}
750
/*
 * device_kill_open_processes - forcibly terminate user processes on reset
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Sends SIGKILL to every process that still has the device open and then
 * waits (up to a bound that is larger on PLDM/simulation setups) for the
 * fpriv list to drain as the killed processes release their file
 * descriptors and in-flight unmap operations complete.
 *
 * Return: 0 if all processes went away, -EBUSY if some are still holding
 *         the device after the timeout.
 */
static int device_kill_open_processes(struct hl_device *hdev)
{
	u16 pending_total, pending_cnt;
	struct hl_fpriv	*hpriv;
	struct task_struct *task = NULL;

	if (hdev->pldm)
		pending_total = HL_PLDM_PENDING_RESET_PER_SEC;
	else
		pending_total = HL_PENDING_RESET_PER_SEC;

	/* Giving the kernel one second to finish releasing a process that
	 * is already on its way out, before we start killing
	 */
	if (!list_empty(&hdev->fpriv_list))
		ssleep(1);

	mutex_lock(&hdev->fpriv_list_lock);

	/* This section must be protected because we are dereferencing
	 * pointers that are freed if the process exits
	 */
	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
		if (task) {
			dev_info(hdev->dev, "Killing user process pid=%d\n",
				task_pid_nr(task));
			send_sig(SIGKILL, task, 1);
			usleep_range(1000, 10000);

			put_task_struct(task);
		}
	}

	mutex_unlock(&hdev->fpriv_list_lock);

	/* We killed the open users, but because the driver cleans up after
	 * the user contexts are closed (e.g. mmu mappings), we need to wait
	 * again to make sure the cleaning phase is finished before continuing
	 * with the reset
	 */
	pending_cnt = pending_total;

	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
		dev_info(hdev->dev,
			"Waiting for all unmap operations to finish before hard reset\n");

		pending_cnt--;

		ssleep(1);
	}

	return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
}
806
807static void device_hard_reset_pending(struct work_struct *work)
808{
809 struct hl_device_reset_work *device_reset_work =
810 container_of(work, struct hl_device_reset_work, reset_work);
811 struct hl_device *hdev = device_reset_work->hdev;
812
813 hl_device_reset(hdev, true, true);
814
815 kfree(device_reset_work);
816}
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
/*
 * hl_device_reset - reset the device
 *
 * @hdev: pointer to habanalabs device structure
 * @hard_reset: should we do hard reset to all engines or just reset the
 *              compute/dma engines
 * @from_hard_reset_thread: is the caller the hard-reset work item (i.e. we
 *              are already in process context with in_reset taken)
 *
 * Block future CS and wait for pending CS to be enqueued. Call ASIC H/W fini
 * and re-init, then call ASIC late init. A hard reset additionally kills all
 * user processes and tears down / rebuilds the MMU, VM and kernel context.
 * A failed soft reset is escalated to a hard reset via the "again" label.
 *
 * Return: 0 for success (or when the reset was handed off to the hard-reset
 *         work item), negative errno otherwise.
 */
int hl_device_reset(struct hl_device *hdev, bool hard_reset,
			bool from_hard_reset_thread)
{
	int i, rc;

	if (!hdev->init_done) {
		dev_err(hdev->dev,
			"Can't reset before initialization is done\n");
		return 0;
	}

	/* ASICs without soft-reset support always take the hard path */
	if ((!hard_reset) && (!hdev->supports_soft_reset)) {
		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
		hard_reset = true;
	}

	/*
	 * Prevent concurrency in this function - only one reset should be
	 * done at any given time. Only need to perform this if we didn't
	 * get from the dedicated hard reset thread
	 */
	if (!from_hard_reset_thread) {
		/* Block future CS/VM/JOB completion operations */
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (rc)
			return 0;

		if (hard_reset) {
			/*
			 * Driver is about to reset the device. The F/W must
			 * not access the H/W from this point onward, so
			 * disable its PCI access before the H/W goes down.
			 * Failure here is non-fatal - the reset proceeds
			 * anyway.
			 */
			if (hl_fw_send_pci_access_msg(hdev,
					ARMCP_PACKET_DISABLE_PCI_ACCESS))
				dev_warn(hdev->dev,
					"Failed to disable PCI access by F/W\n");
		}

		/* This also blocks future CS/VM/JOB completion operations */
		hdev->disabled = true;

		/* Flush anyone that is inside the critical section of enqueue
		 * jobs to the H/W
		 */
		hdev->asic_funcs->hw_queues_lock(hdev);
		hdev->asic_funcs->hw_queues_unlock(hdev);

		/* Flush anyone that is inside device open */
		mutex_lock(&hdev->fpriv_list_lock);
		mutex_unlock(&hdev->fpriv_list_lock);

		dev_err(hdev->dev, "Going to RESET device!\n");
	}

again:
	if ((hard_reset) && (!from_hard_reset_thread)) {
		struct hl_device_reset_work *device_reset_work;

		hdev->hard_reset_pending = true;

		device_reset_work = kzalloc(sizeof(*device_reset_work),
						GFP_ATOMIC);
		if (!device_reset_work) {
			rc = -ENOMEM;
			goto out_err;
		}

		/*
		 * Because the reset function can't run from interrupt or
		 * from heartbeat work, we need to call the reset function
		 * from a dedicated work
		 */
		INIT_WORK(&device_reset_work->reset_work,
				device_hard_reset_pending);
		device_reset_work->hdev = hdev;
		schedule_work(&device_reset_work->reset_work);

		return 0;
	}

	if (hard_reset) {
		device_late_fini(hdev);

		/*
		 * Now that the heartbeat thread is closed, flush anyone that
		 * is inside the critical section of sending a message to
		 * the device CPU
		 */
		mutex_lock(&hdev->send_cpu_message_lock);
		mutex_unlock(&hdev->send_cpu_message_lock);
	}

	/*
	 * Halt the engines and disable interrupts so no more CS will get
	 * completed and no new jobs will be submitted
	 */
	hdev->asic_funcs->halt_engines(hdev, hard_reset);

	/* Go over all the queues, release all CS and their jobs */
	hl_cs_rollback_all(hdev);

	if (hard_reset) {
		/* Kill processes here after CS rollback. This is because the
		 * process can't really exit until all its CSs are done, which
		 * is what we do in cs rollback
		 */
		rc = device_kill_open_processes(hdev);
		if (rc) {
			dev_crit(hdev->dev,
				"Failed to kill all open processes, stopping hard reset\n");
			goto out_err;
		}

		/* Flush the Event queue workers to make sure no other thread
		 * is reading or writing to registers during the reset
		 */
		flush_workqueue(hdev->eq_wq);
	}

	/* Release kernel context (last put frees it) */
	if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
		hdev->kernel_ctx = NULL;

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, hard_reset);

	if (hard_reset) {
		hl_vm_fini(hdev);
		hl_mmu_fini(hdev);
		hl_eq_reset(hdev, &hdev->event_queue);
	}

	/* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
	hl_hw_queue_reset(hdev, hard_reset);
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_reset(hdev, &hdev->completion_queue[i]);

	/* Restart the idle/busy accounting from a clean slate */
	hdev->idle_busy_ts_idx = 0;
	hdev->idle_busy_ts_arr[0].busy_to_idle_ts = ktime_set(0, 0);
	hdev->idle_busy_ts_arr[0].idle_to_busy_ts = ktime_set(0, 0);

	if (hdev->cs_active_cnt)
		dev_crit(hdev->dev, "CS active cnt %d is not 0 during reset\n",
			hdev->cs_active_cnt);

	mutex_lock(&hdev->fpriv_list_lock);

	/* Make sure the context switch phase will run again */
	if (hdev->compute_ctx) {
		atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1);
		hdev->compute_ctx->thread_ctx_switch_wait_token = 0;
	}

	mutex_unlock(&hdev->fpriv_list_lock);

	/* Finished tearing down, now start the init */

	if (hard_reset) {
		hdev->device_cpu_disabled = false;
		hdev->hard_reset_pending = false;

		if (hdev->kernel_ctx) {
			dev_crit(hdev->dev,
				"kernel ctx was alive during hard reset, something is terribly wrong\n");
			rc = -EBUSY;
			goto out_err;
		}

		rc = hl_mmu_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed to initialize MMU S/W after hard reset\n");
			goto out_err;
		}

		/* Allocate the kernel context */
		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
						GFP_KERNEL);
		if (!hdev->kernel_ctx) {
			rc = -ENOMEM;
			goto out_err;
		}

		hdev->compute_ctx = NULL;

		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
		if (rc) {
			dev_err(hdev->dev,
				"failed to init kernel ctx in hard reset\n");
			kfree(hdev->kernel_ctx);
			hdev->kernel_ctx = NULL;
			goto out_err;
		}
	}

	/* Device is now enabled as part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to initialize the H/W after reset\n");
		goto out_err;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to detect if device is alive after reset\n");
		goto out_err;
	}

	if (hard_reset) {
		rc = device_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed late init after hard reset\n");
			goto out_err;
		}

		rc = hl_vm_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed to init memory module after hard reset\n");
			goto out_err;
		}

		hl_set_max_power(hdev);
	} else {
		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"Failed late init after soft reset\n");
			goto out_err;
		}
	}

	atomic_set(&hdev->in_reset, 0);

	if (hard_reset)
		hdev->hard_reset_cnt++;
	else
		hdev->soft_reset_cnt++;

	dev_warn(hdev->dev, "Successfully finished resetting the device\n");

	return 0;

out_err:
	hdev->disabled = true;

	if (hard_reset) {
		dev_err(hdev->dev,
			"Failed to reset! Device is NOT usable\n");
		hdev->hard_reset_cnt++;
	} else {
		dev_err(hdev->dev,
			"Failed to do soft-reset, trying hard reset\n");
		hdev->soft_reset_cnt++;
		hard_reset = true;
		goto again;
	}

	atomic_set(&hdev->in_reset, 0);

	return rc;
}
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
/*
 * hl_device_init - main initialization function for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: class to attach the char devices to
 *
 * Allocate the char devices, call ASIC sw_init, create the queues and
 * kernel context, initialize the H/W and expose the device via cdev/sysfs.
 * Failures after the H/W-init stage deliberately return 0 while leaving
 * the device disabled, so the char devices stay available for inspection.
 *
 * Return: 0 for success (possibly with a disabled device), negative errno
 *         for failures before the H/W-init stage.
 */
int hl_device_init(struct hl_device *hdev, struct class *hclass)
{
	int i, rc, cq_cnt, cq_ready_cnt;
	char *name;
	bool add_cdev_sysfs_on_err = false;

	/* NOTE(review): hdev->id / 2 - presumably each device consumes two
	 * consecutive ids (compute + control); confirm against the id
	 * allocation in the probe code.
	 */
	name = kasprintf(GFP_KERNEL, "hl%d", hdev->id / 2);
	if (!name) {
		rc = -ENOMEM;
		goto out_disabled;
	}

	/* Initialize cdev and device structures for the compute device */
	rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
				&hdev->cdev, &hdev->dev);

	kfree(name);

	if (rc)
		goto out_disabled;

	name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->id / 2);
	if (!name) {
		rc = -ENOMEM;
		goto free_dev;
	}

	/* Initialize cdev and device structures for the control device */
	rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
				name, &hdev->cdev_ctrl, &hdev->dev_ctrl);

	kfree(name);

	if (rc)
		goto free_dev;

	/* Initialize ASIC function pointers and perform early init */
	rc = device_early_init(hdev);
	if (rc)
		goto free_dev_ctrl;

	/*
	 * Start calling ASIC initialization. First S/W then H/W and finally
	 * late init
	 */
	rc = hdev->asic_funcs->sw_init(hdev);
	if (rc)
		goto early_fini;

	/*
	 * Initialize the H/W queues. Must be done before hw_init, because
	 * there the addresses of the kernel queue are being written to the
	 * registers of the device
	 */
	rc = hl_hw_queues_create(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel queues\n");
		goto sw_fini;
	}

	cq_cnt = hdev->asic_prop.completion_queues_count;

	/*
	 * Initialize the completion queues. Must be done before hw_init,
	 * because there the addresses of the completion queues are being
	 * passed as arguments to request_irq
	 */
	if (cq_cnt) {
		hdev->completion_queue = kcalloc(cq_cnt,
				sizeof(*hdev->completion_queue),
				GFP_KERNEL);

		if (!hdev->completion_queue) {
			dev_err(hdev->dev,
				"failed to allocate completion queues\n");
			rc = -ENOMEM;
			goto hw_queues_destroy;
		}
	}

	for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
		rc = hl_cq_init(hdev, &hdev->completion_queue[i],
				hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
		if (rc) {
			dev_err(hdev->dev,
				"failed to initialize completion queue\n");
			goto cq_fini;
		}
		hdev->completion_queue[i].cq_idx = i;
	}

	/*
	 * Initialize the event queue. Must be done before hw_init,
	 * because there the address of the event queue is being
	 * passed as argument to request_irq
	 */
	rc = hl_eq_init(hdev, &hdev->event_queue);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize event queue\n");
		goto cq_fini;
	}

	/* MMU S/W must be initialized before kernel context is created */
	rc = hl_mmu_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
		goto eq_fini;
	}

	/* Allocate the kernel context */
	hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
	if (!hdev->kernel_ctx) {
		rc = -ENOMEM;
		goto mmu_fini;
	}

	hdev->compute_ctx = NULL;

	rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize kernel context\n");
		kfree(hdev->kernel_ctx);
		goto mmu_fini;
	}

	rc = hl_cb_pool_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize CB pool\n");
		goto release_ctx;
	}

	hl_debugfs_add_device(hdev);

	if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
		dev_info(hdev->dev,
			"H/W state is dirty, must reset before initializing\n");
		hdev->asic_funcs->halt_engines(hdev, true);
		hdev->asic_funcs->hw_fini(hdev, true);
	}

	/*
	 * From this point, in case of an error, add char devices and create
	 * sysfs nodes as part of the error flow, to allow debugging.
	 */
	add_cdev_sysfs_on_err = true;

	/* Device is now enabled as part of the initialization requires
	 * communication with the device firmware to get information that
	 * is required for the initialization itself
	 */
	hdev->disabled = false;

	rc = hdev->asic_funcs->hw_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize the H/W\n");
		rc = 0;
		goto out_disabled;
	}

	/* Check that the communication with the device is working */
	rc = hdev->asic_funcs->test_queues(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to detect if device is alive\n");
		rc = 0;
		goto out_disabled;
	}

	rc = device_late_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed late initialization\n");
		rc = 0;
		goto out_disabled;
	}

	dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
		hdev->asic_name,
		hdev->asic_prop.dram_size / 1024 / 1024 / 1024);

	rc = hl_vm_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize memory module\n");
		rc = 0;
		goto out_disabled;
	}

	/*
	 * Expose devices and sysfs nodes to user.
	 * From here there is no need to add char devices and create sysfs
	 * nodes in the error flow.
	 */
	add_cdev_sysfs_on_err = false;
	rc = device_cdev_sysfs_add(hdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to add char devices and sysfs nodes\n");
		rc = 0;
		goto out_disabled;
	}

	/* Need to call this after the first time device_cdev_sysfs_add was
	 * called, because only there the sysfs nodes exist
	 */
	hl_set_max_power(hdev);

	/* hwmon classifies the device according to its chip info, which is
	 * retrieved during late init, so it must come last
	 */
	rc = hl_hwmon_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to initialize hwmon\n");
		rc = 0;
		goto out_disabled;
	}

	dev_notice(hdev->dev,
		"Successfully added device to habanalabs driver\n");

	hdev->init_done = true;

	return 0;

release_ctx:
	if (hl_ctx_put(hdev->kernel_ctx) != 1)
		dev_err(hdev->dev,
			"kernel ctx is still alive on initialization failure\n");
mmu_fini:
	hl_mmu_fini(hdev);
eq_fini:
	hl_eq_fini(hdev, &hdev->event_queue);
cq_fini:
	for (i = 0 ; i < cq_ready_cnt ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);
hw_queues_destroy:
	hl_hw_queues_destroy(hdev);
sw_fini:
	hdev->asic_funcs->sw_fini(hdev);
early_fini:
	device_early_fini(hdev);
free_dev_ctrl:
	kfree(hdev->dev_ctrl);
free_dev:
	kfree(hdev->dev);
out_disabled:
	hdev->disabled = true;
	if (add_cdev_sysfs_on_err)
		device_cdev_sysfs_add(hdev);
	if (hdev->pdev)
		dev_err(&hdev->pdev->dev,
			"Failed to initialize hl%d. Device is NOT usable !\n",
			hdev->id / 2);
	else
		pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
			hdev->id / 2);

	return rc;
}
1382
1383
1384
1385
1386
1387
1388
1389
/*
 * hl_device_fini - main tear-down function for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Destroy the device, call ASIC fini functions and release the id. Must
 * first win the in_reset flag (waiting out any in-flight reset, bounded by
 * HL_HARD_RESET_MAX_TIMEOUT), then unwinds everything hl_device_init()
 * built, in reverse order.
 */
void hl_device_fini(struct hl_device *hdev)
{
	int i, rc;
	ktime_t timeout;

	dev_info(hdev->dev, "Removing device\n");

	/*
	 * This function is competing with the reset function, so try to
	 * take the reset atomic and if we are already in middle of reset,
	 * wait until reset function is finished. Reset function is designed
	 * to always finish (could take up to a few seconds in worst case).
	 */
	timeout = ktime_add_us(ktime_get(),
				HL_HARD_RESET_MAX_TIMEOUT * 1000 * 1000);
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	while (rc) {
		usleep_range(50, 200);
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (ktime_compare(ktime_get(), timeout) > 0) {
			WARN(1, "Failed to remove device because reset function did not finish\n");
			return;
		}
	}

	/* Mark device as disabled */
	hdev->disabled = true;

	/* Flush anyone that is inside the critical section of enqueue
	 * jobs to the H/W
	 */
	hdev->asic_funcs->hw_queues_lock(hdev);
	hdev->asic_funcs->hw_queues_unlock(hdev);

	/* Flush anyone that is inside device open */
	mutex_lock(&hdev->fpriv_list_lock);
	mutex_unlock(&hdev->fpriv_list_lock);

	hdev->hard_reset_pending = true;

	hl_hwmon_fini(hdev);

	device_late_fini(hdev);

	hl_debugfs_remove_device(hdev);

	/*
	 * Halt the engines and disable interrupts so no more CS will get
	 * completed and no new jobs will be submitted
	 */
	hdev->asic_funcs->halt_engines(hdev, true);

	/* Go over all the queues, release all CS and their jobs */
	hl_cs_rollback_all(hdev);

	/* Kill processes here after CS rollback. This is because the process
	 * can't really exit until all its CSs are done, which is what we
	 * do in cs rollback
	 */
	rc = device_kill_open_processes(hdev);
	if (rc)
		dev_crit(hdev->dev, "Failed to kill all open processes\n");

	hl_cb_pool_fini(hdev);

	/* Release kernel context (last put frees it) */
	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
		dev_err(hdev->dev, "kernel ctx is still alive\n");

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, true);

	hl_vm_fini(hdev);

	hl_mmu_fini(hdev);

	hl_eq_fini(hdev, &hdev->event_queue);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);

	hl_hw_queues_destroy(hdev);

	/* Call ASIC S/W finalize function */
	hdev->asic_funcs->sw_fini(hdev);

	device_early_fini(hdev);

	/* Hide devices and sysfs nodes from user */
	device_cdev_sysfs_del(hdev);

	pr_info("removed device successfully\n");
}
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
/*
 * hl_rreg - read a 32-bit register from the device's MMIO space
 *
 * @hdev: pointer to habanalabs device structure
 * @reg: register offset from the MMIO base
 *
 * Return: the value read from the register.
 */
inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
{
	return readl(hdev->rmmio + reg);
}
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
/*
 * hl_wreg - write a 32-bit value to a register in the device's MMIO space
 *
 * @hdev: pointer to habanalabs device structure
 * @reg: register offset from the MMIO base
 * @val: value to write
 */
inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
{
	writel(val, hdev->rmmio + reg);
}
1520