Discussion:
panic possibly on on bridge member removal
(too old to reply)
Kim Culhan
2012-09-29 12:40:03 UTC
Permalink
After a few hours of operation involving tap0 added to the bridge
running openvpn
and shutting down openvpn which removes tap0 from the bridge, the
machine is found to have a panic:

Fatal trap 12: page fault while in kernel mode
cpuid = 1; apic id = 01
fault virtual address = 0x188
fault code = supervisor read data, page not present
instruction pointer = 0x20:0xffffffff82a14f96
stack pointer = 0x28:0xffffff8000285670
frame pointer = 0x28:0xffffff80002856b0
code segment = base 0x0, limit 0xfffff, type 0x1b
= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags = interrupt enabled, resume, IOPL = 0
current process = 12 (swi5: fast taskq)
[ thread pid 12 tid 100022 ]
Stopped at bridge_enqueue+0x86: calll *0x188(%r12)
db> bt
Tracing pid 12 tid 100022 td 0xfffffe0003aff000
bridge_enqueue() at bridge_enqueue+0x86
ether_output() at ether_output+0x580
ip_output() at ip_output+0xb2e
ip_forward() at ip_forward+0x1e5
ip_input() at ip_input+0x54e
netisr_dispatch_src() at netisr_dispatch_src+0x7b
ether_demux() at ether_demux+0x156
ether_nh_input() at ether_nh_input+0x31a
netisr_dispatch_src() at netisr_dispatch_src+0x7b
re_rxeof() at re_rxeof+0x515
re_int_task() at re_int_task+0x8b
taskqueue_run_locked() at taskqueue_run_locked+0xf0
taskqueue_run() at taskqueue_run+0x4a
intr_event_execute_handlers() at intr_event_execute_handlers+0xaa
ithread_loop() at ithread_loop+0xde
fork_exit() at fork_exit+0x80
fork_trampoline() at fork_trampoline+0xe
--- trap 0, rip = 0, rsp = 0xffffff8000285cb0, rbp = 0 ---

Trying to get a dump results in a hang:

db> call doadump
Dumping 762 out of 4055 MB: (hang)

thanks
-kim
John Baldwin
2012-10-01 12:04:08 UTC
Permalink
Post by Kim Culhan
After a few hours of operation involving tap0 added to the bridge
running openvpn
and shutting down openvpn which removes tap0 from the bridge, the
Fatal trap 12: page fault while in kernel mode
cpuid = 1; apic id = 01
fault virtual address = 0x188
fault code = supervisor read data, page not present
instruction pointer = 0x20:0xffffffff82a14f96
stack pointer = 0x28:0xffffff8000285670
frame pointer = 0x28:0xffffff80002856b0
code segment = base 0x0, limit 0xfffff, type 0x1b
= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags = interrupt enabled, resume, IOPL = 0
current process = 12 (swi5: fast taskq)
[ thread pid 12 tid 100022 ]
Stopped at bridge_enqueue+0x86: calll *0x188(%r12)
db> bt
Tracing pid 12 tid 100022 td 0xfffffe0003aff000
bridge_enqueue() at bridge_enqueue+0x86
Can you run 'gdb /boot/kernel/kernel' and do 'l *bridge_enqueue+0x86'?
--
John Baldwin
Kim Culhan
2012-10-01 13:08:30 UTC
Permalink
Post by John Baldwin
Post by Kim Culhan
After a few hours of operation involving tap0 added to the bridge
running openvpn
and shutting down openvpn which removes tap0 from the bridge, the
Fatal trap 12: page fault while in kernel mode
cpuid = 1; apic id = 01
fault virtual address = 0x188
fault code = supervisor read data, page not present
instruction pointer = 0x20:0xffffffff82a14f96
stack pointer = 0x28:0xffffff8000285670
frame pointer = 0x28:0xffffff80002856b0
code segment = base 0x0, limit 0xfffff, type 0x1b
= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags = interrupt enabled, resume, IOPL = 0
current process = 12 (swi5: fast taskq)
[ thread pid 12 tid 100022 ]
Stopped at bridge_enqueue+0x86: calll *0x188(%r12)
db> bt
Tracing pid 12 tid 100022 td 0xfffffe0003aff000
bridge_enqueue() at bridge_enqueue+0x86
Can you run 'gdb /boot/kernel/kernel' and do 'l *bridge_enqueue+0x86'?
--
John Baldwin
(gdb) l *bridge_enqueue+0x86
No symbol "bridge_enqueue" in current context.
(gdb)

--
-kim
Kim Culhan
2012-10-01 15:05:30 UTC
Permalink
Post by Kim Culhan
Post by John Baldwin
Post by Kim Culhan
After a few hours of operation involving tap0 added to the bridge
running openvpn
and shutting down openvpn which removes tap0 from the bridge, the
Fatal trap 12: page fault while in kernel mode
cpuid = 1; apic id = 01
fault virtual address = 0x188
fault code = supervisor read data, page not present
instruction pointer = 0x20:0xffffffff82a14f96
stack pointer = 0x28:0xffffff8000285670
frame pointer = 0x28:0xffffff80002856b0
code segment = base 0x0, limit 0xfffff, type 0x1b
= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags = interrupt enabled, resume, IOPL = 0
current process = 12 (swi5: fast taskq)
[ thread pid 12 tid 100022 ]
Stopped at bridge_enqueue+0x86: calll *0x188(%r12)
db> bt
Tracing pid 12 tid 100022 td 0xfffffe0003aff000
bridge_enqueue() at bridge_enqueue+0x86
Can you run 'gdb /boot/kernel/kernel' and do 'l *bridge_enqueue+0x86'?
--
John Baldwin
(gdb) l *bridge_enqueue+0x86
No symbol "bridge_enqueue" in current context.
(gdb)
Oh, are you using if_bridge.ko as a module? If so, you can try running 'gdb
/boot/kernel/if_bridge.ko' instead.
(gdb) l *bridge_enqueue+0x86
0x2f96 is in bridge_enqueue
(/usr/src/sys/modules/if_bridge/../../net/if_bridge.c:1810).
1805 continue;
1806 }
1807 m->m_flags &= ~M_VLANTAG;
1808 }
1809
1810 if ((err = dst_ifp->if_transmit(dst_ifp, m))) {
1811 m_freem(m0);
1812 break;
1813 }
1814 }
(gdb)

--
-lo,
Adrian Chadd
2012-10-01 16:41:07 UTC
Permalink
Sounds like dst_ifp is NULL at that point?



Adrian
John Baldwin
2012-10-01 18:29:31 UTC
Permalink
Post by Kim Culhan
Post by Kim Culhan
Post by John Baldwin
Post by Kim Culhan
After a few hours of operation involving tap0 added to the bridge
running openvpn
and shutting down openvpn which removes tap0 from the bridge, the
Fatal trap 12: page fault while in kernel mode
cpuid = 1; apic id = 01
fault virtual address = 0x188
fault code = supervisor read data, page not present
instruction pointer = 0x20:0xffffffff82a14f96
stack pointer = 0x28:0xffffff8000285670
frame pointer = 0x28:0xffffff80002856b0
code segment = base 0x0, limit 0xfffff, type 0x1b
= DPL 0, pres 1, long 1, def32 0, gran 1
processor eflags = interrupt enabled, resume, IOPL = 0
current process = 12 (swi5: fast taskq)
[ thread pid 12 tid 100022 ]
Stopped at bridge_enqueue+0x86: calll *0x188(%r12)
db> bt
Tracing pid 12 tid 100022 td 0xfffffe0003aff000
bridge_enqueue() at bridge_enqueue+0x86
Can you run 'gdb /boot/kernel/kernel' and do 'l *bridge_enqueue+0x86'?
--
John Baldwin
(gdb) l *bridge_enqueue+0x86
No symbol "bridge_enqueue" in current context.
(gdb)
Oh, are you using if_bridge.ko as a module? If so, you can try running 'gdb
/boot/kernel/if_bridge.ko' instead.
(gdb) l *bridge_enqueue+0x86
0x2f96 is in bridge_enqueue
(/usr/src/sys/modules/if_bridge/../../net/if_bridge.c:1810).
1805 continue;
1806 }
1807 m->m_flags &= ~M_VLANTAG;
1808 }
1809
1810 if ((err = dst_ifp->if_transmit(dst_ifp, m))) {
1811 m_freem(m0);
1812 break;
1813 }
1814 }
(gdb)
Hmm, try this perhaps. I think there is a race where the departing bridge
member can be destroyed before the bridge code is finishing using it. The
way the bridge driver is supposed to prevent that is by ensuring that
bridge_ifdetach() doesn't return until all other references to the detaching
interface are known to be released in the bridge code. I think what is
happening for you is that a bridge_enqueue() performed outside the lock is
running into dst_ifp that has been if_free()'d and had its if_transmit set
to zero when it was reallocated for something else.

Index: if_bridge.c
===================================================================
--- if_bridge.c (revision 241096)
+++ if_bridge.c (working copy)
@@ -1833,6 +1833,7 @@ static void
bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
{
struct bridge_softc *sc;
+ int error = 0;

sc = ifp->if_bridge;

@@ -1846,18 +1847,30 @@ bridge_dummynet(struct mbuf *m, struct ifnet *ifp)
return;
}

+ BRIDGE_LOCK(sc);
+ BRIDGE_LOCK2REF(sc, error);
+ if (error) {
+ m_freem(m);
+ return;
+ }
+
if (PFIL_HOOKED(&V_inet_pfil_hook)
#ifdef INET6
|| PFIL_HOOKED(&V_inet6_pfil_hook)
#endif
) {
- if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0)
+ if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) {
+ BRIDGE_UNREF(sc);
return;
- if (m == NULL)
+ }
+ if (m == NULL) {
+ BRIDGE_UNREF(sc);
return;
+ }
}

bridge_enqueue(sc, ifp, m);
+ BRIDGE_UNREF(sc);
}

/*
@@ -1971,8 +1984,13 @@ sendunicast:
return (0);
}

- BRIDGE_UNLOCK(sc);
+ BRIDGE_LOCK2REF(sc, error);
+ if (error) {
+ m_freem(m);
+ return (0);
+ }
bridge_enqueue(sc, dst_if, m);
+ BRIDGE_UNREF(sc);
return (0);
}

@@ -2000,8 +2018,13 @@ bridge_transmit(struct ifnet *ifp, struct mbuf *m)

eh = mtod(m, struct ether_header *);
dst_if = bridge_rtlookup(sc, eh->ether_dhost, 1);
- BRIDGE_UNLOCK(sc);
- error = bridge_enqueue(sc, dst_if, m);
+ BRIDGE_LOCK2REF(sc, error);
+ if (error)
+ m_freem(m);
+ else {
+ error = bridge_enqueue(sc, dst_if, m);
+ BRIDGE_UNREF(sc);
+ }
} else
bridge_broadcast(sc, ifp, m, 0);

@@ -2145,20 +2168,29 @@ bridge_forward(struct bridge_softc *sc, struct bri
dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING)
goto drop;

- BRIDGE_UNLOCK(sc);
+ BRIDGE_LOCK2REF(sc, error);
+ if (error) {
+ m_freem(m);
+ return;
+ }

if (PFIL_HOOKED(&V_inet_pfil_hook)
#ifdef INET6
|| PFIL_HOOKED(&V_inet6_pfil_hook)
#endif
) {
- if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0)
+ if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) {
+ BRIDGE_UNREF(sc);
return;
- if (m == NULL)
+ }
+ if (m == NULL) {
+ BRIDGE_UNREF(sc);
return;
+ }
}

bridge_enqueue(sc, dst_if, m);
+ BRIDGE_UNREF(sc);
return;

drop:
--
John Baldwin
Loading...